diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..3550a30 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake diff --git a/cmd/qvm/doctor.go b/cmd/qvm/doctor.go new file mode 100644 index 0000000..1929083 --- /dev/null +++ b/cmd/qvm/doctor.go @@ -0,0 +1,280 @@ +package main + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "qvm/internal/config" + "qvm/internal/vm" + "qvm/internal/workspace" + "strconv" + "strings" + + "github.com/spf13/cobra" +) + +var doctorCmd = &cobra.Command{ + Use: "doctor", + Short: "Diagnose and fix common QVM issues", + Long: `Runs diagnostic checks on QVM configuration and state. + +Checks for: +- KVM availability and permissions +- Required binaries (qemu-system-x86_64, virtiofsd, sshpass, nc) +- Base image presence +- Stale state files (orphaned PID files, sockets) +- VM process state consistency + +If issues are found, provides remediation steps.`, + Run: func(cmd *cobra.Command, args []string) { + var issues []string + var fixes []string + + fmt.Println("QVM Doctor - Diagnosing issues...") + fmt.Println() + + // Check 1: KVM availability + fmt.Print("Checking KVM availability... ") + if _, err := os.Stat("/dev/kvm"); err != nil { + fmt.Println("FAIL") + issues = append(issues, "KVM not available (/dev/kvm not found)") + fixes = append(fixes, "Ensure KVM is enabled in BIOS and kvm module is loaded: modprobe kvm_intel (or kvm_amd)") + } else { + // Check KVM permissions + f, err := os.OpenFile("/dev/kvm", os.O_RDWR, 0) + if err != nil { + fmt.Println("FAIL") + issues = append(issues, fmt.Sprintf("Cannot access /dev/kvm: %v", err)) + fixes = append(fixes, "Add your user to the kvm group: sudo usermod -aG kvm $USER (then logout/login)") + } else { + f.Close() + fmt.Println("OK") + } + } + + // Check 2: Required binaries + requiredBinaries := []string{ + "qemu-system-x86_64", + "virtiofsd", + "sshpass", + "nc", + "qemu-img", + } + + for _, bin := range requiredBinaries { + fmt.Printf("Checking for %s... ", bin) + if _, err := exec.LookPath(bin); err != nil { + fmt.Println("MISSING") + issues = append(issues, fmt.Sprintf("Required binary not found: %s", bin)) + fixes = append(fixes, fmt.Sprintf("Install %s via your package manager or nix", bin)) + } else { + fmt.Println("OK") + } + } + + // Check 3: Base image + fmt.Print("Checking base image... ") + if _, err := os.Stat(config.BaseImage); err != nil { + fmt.Println("MISSING") + issues = append(issues, "Base image not found at "+config.BaseImage) + fixes = append(fixes, "Run 'qvm rebuild' to build the base image") + } else { + fmt.Println("OK") + } + + // Check 4: State file consistency + fmt.Print("Checking state files... ") + stateIssues := checkStateFiles() + if len(stateIssues) > 0 { + fmt.Println("ISSUES FOUND") + issues = append(issues, stateIssues...) + fixes = append(fixes, "Run 'qvm doctor --fix' to clean up stale state files") + } else { + fmt.Println("OK") + } + + // Check 5: virtiofsd sockets + fmt.Print("Checking virtiofsd sockets... ") + socketIssues := checkVirtiofsdSockets() + if len(socketIssues) > 0 { + fmt.Println("ISSUES FOUND") + issues = append(issues, socketIssues...) + fixes = append(fixes, "Stale sockets will be cleaned up on next start") + } else { + fmt.Println("OK") + } + + // Check 6: Workspace registry + fmt.Print("Checking workspace registry... 
") + regResult := workspace.Load(config.WorkspacesFile) + if regResult.IsError() { + fmt.Println("ERROR") + issues = append(issues, fmt.Sprintf("Cannot load workspace registry: %v", regResult.Error())) + fixes = append(fixes, "The registry will be recreated on next 'qvm run'") + } else { + reg := regResult.MustGet() + workspaces := reg.List() + invalidCount := 0 + for _, ws := range workspaces { + if _, err := os.Stat(ws.HostPath); err != nil { + invalidCount++ + } + } + if invalidCount > 0 { + fmt.Printf("WARNING (%d workspaces with missing host paths)\n", invalidCount) + } else { + fmt.Printf("OK (%d workspaces registered)\n", len(workspaces)) + } + } + + fmt.Println() + + // Summary + if len(issues) == 0 { + fmt.Println("All checks passed!") + } else { + fmt.Printf("Found %d issue(s):\n", len(issues)) + fmt.Println() + for i, issue := range issues { + fmt.Printf(" %d. %s\n", i+1, issue) + if i < len(fixes) { + fmt.Printf(" Fix: %s\n", fixes[i]) + } + } + fmt.Println() + + fixFlag, _ := cmd.Flags().GetBool("fix") + if fixFlag { + fmt.Println("Applying automatic fixes...") + applyFixes() + } else { + fmt.Println("Run 'qvm doctor --fix' to apply automatic fixes where possible.") + } + } + }, +} + +func init() { + doctorCmd.Flags().Bool("fix", false, "Attempt to automatically fix issues") +} + +// checkStateFiles verifies that PID files correspond to running processes +func checkStateFiles() []string { + var issues []string + + // Check if PID file exists but process doesn't + pidBytes, err := os.ReadFile(config.PIDFile) + if err == nil { + pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes))) + if err == nil { + if !vm.IsRunning() { + issues = append(issues, fmt.Sprintf("Stale PID file: %s (process %d not running)", config.PIDFile, pid)) + } + } else { + issues = append(issues, fmt.Sprintf("Invalid PID file: %s", config.PIDFile)) + } + } + + // Check for orphaned QMP socket + if _, err := os.Stat(config.QMPSocket); err == nil && !vm.IsRunning() { + issues = append(issues, fmt.Sprintf("Orphaned QMP socket: %s", config.QMPSocket)) + } + + // Check for orphaned SSH port file + if _, err := os.Stat(config.SSHPortFile); err == nil && !vm.IsRunning() { + issues = append(issues, fmt.Sprintf("Orphaned SSH port file: %s", config.SSHPortFile)) + } + + return issues +} + +// checkVirtiofsdSockets looks for orphaned virtiofsd sockets +func checkVirtiofsdSockets() []string { + var issues []string + + pattern := filepath.Join(config.StateDir, "*.sock") + sockets, err := filepath.Glob(pattern) + if err != nil { + return issues + } + + for _, sock := range sockets { + // Skip QMP socket + if sock == config.QMPSocket { + continue + } + + // Check if corresponding virtiofsd is running + baseName := strings.TrimSuffix(filepath.Base(sock), ".sock") + pidFile := filepath.Join(config.StateDir, fmt.Sprintf("virtiofsd-%s.pid", baseName)) + + if _, err := os.Stat(pidFile); os.IsNotExist(err) { + issues = append(issues, fmt.Sprintf("Orphaned virtiofsd socket: %s (no PID file)", sock)) + continue + } + + pidBytes, err := os.ReadFile(pidFile) + if err != nil { + continue + } + + pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes))) + if err != nil { + issues = append(issues, fmt.Sprintf("Invalid virtiofsd PID file: %s", pidFile)) + continue + } + + // Check if process is running + process, err := os.FindProcess(pid) + if err != nil { + issues = append(issues, fmt.Sprintf("Orphaned virtiofsd socket: %s (process lookup failed)", sock)) + continue + } + + if err := process.Signal(os.Signal(nil)); err != 
nil { + issues = append(issues, fmt.Sprintf("Orphaned virtiofsd socket: %s (process %d not running)", sock, pid)) + } + } + + return issues +} + +// applyFixes attempts to clean up stale state files +func applyFixes() { + if !vm.IsRunning() { + // Clean up all state files + fmt.Println(" Cleaning up stale state files...") + + if err := os.Remove(config.PIDFile); err == nil { + fmt.Printf(" Removed: %s\n", config.PIDFile) + } + if err := os.Remove(config.SSHPortFile); err == nil { + fmt.Printf(" Removed: %s\n", config.SSHPortFile) + } + if err := os.Remove(config.QMPSocket); err == nil { + fmt.Printf(" Removed: %s\n", config.QMPSocket) + } + + // Clean up virtiofsd sockets and PID files + pattern := filepath.Join(config.StateDir, "*.sock") + sockets, _ := filepath.Glob(pattern) + for _, sock := range sockets { + if err := os.Remove(sock); err == nil { + fmt.Printf(" Removed: %s\n", sock) + } + } + + pattern = filepath.Join(config.StateDir, "virtiofsd-*.pid") + pidFiles, _ := filepath.Glob(pattern) + for _, pidFile := range pidFiles { + if err := os.Remove(pidFile); err == nil { + fmt.Printf(" Removed: %s\n", pidFile) + } + } + + fmt.Println(" Done!") + } else { + fmt.Println(" VM is running, cannot clean up state files. Stop VM first with 'qvm stop'.") + } +} diff --git a/cmd/qvm/main.go b/cmd/qvm/main.go index 8e353e7..adf280c 100644 --- a/cmd/qvm/main.go +++ b/cmd/qvm/main.go @@ -28,6 +28,7 @@ func init() { rootCmd.AddCommand(rebuildCmd) rootCmd.AddCommand(resetCmd) rootCmd.AddCommand(cleanCmd) + rootCmd.AddCommand(doctorCmd) } func main() { diff --git a/cmd/qvm/run.go b/cmd/qvm/run.go index 4f8febb..9c4a47f 100644 --- a/cmd/qvm/run.go +++ b/cmd/qvm/run.go @@ -23,7 +23,10 @@ var runCmd = &cobra.Command{ The current directory is automatically registered as a workspace and mounted into the VM. The command runs in the mounted workspace. -If no command is provided, starts an interactive zsh shell.`, +If no command is provided, starts an interactive zsh shell. 
+ +Unlike previous versions, new workspaces can be hot-mounted into +a running VM without requiring a restart.`, Args: cobra.ArbitraryArgs, Run: func(cmd *cobra.Command, args []string) { // Default to zsh if no command provided @@ -72,32 +75,24 @@ If no command is provided, starts an interactive zsh shell.`, os.Exit(1) } } else { - statusResult := vm.Status() - if statusResult.IsError() { - logging.Error(statusResult.Error().Error()) - os.Exit(1) - } - status := statusResult.MustGet() + // Check if workspace is mounted, hot-mount if not + if !vm.IsWorkspaceMounted(ws) { + logging.Info(fmt.Sprintf("Hot-mounting workspace %s...", ws.Hash)) + hotMountResult := vm.HotMountWorkspace(ws) + if hotMountResult.IsError() { + logging.Error(fmt.Sprintf("Failed to hot-mount workspace: %v", hotMountResult.Error())) + logging.Info("Falling back to VM restart...") - checkCmd := exec.Command("sshpass", "-p", "root", "ssh", - "-o", "StrictHostKeyChecking=no", - "-o", "UserKnownHostsFile=/dev/null", - "-o", "LogLevel=ERROR", - "-p", strconv.Itoa(status.SSHPort), - "root@localhost", - fmt.Sprintf("test -d /sys/bus/virtio/drivers/9pnet_virtio/*/mount_tag && grep -q 'ws_%s' /sys/bus/virtio/drivers/9pnet_virtio/*/mount_tag 2>/dev/null", ws.Hash)) - - if checkCmd.Run() != nil { - logging.Info("Workspace not available in running VM, restarting VM...") - stopResult := vm.Stop() - if stopResult.IsError() { - logging.Error(stopResult.Error().Error()) - os.Exit(1) - } - startResult := vm.Start(cfg, reg) - if startResult.IsError() { - logging.Error(startResult.Error().Error()) - os.Exit(1) + stopResult := vm.Stop() + if stopResult.IsError() { + logging.Error(stopResult.Error().Error()) + os.Exit(1) + } + startResult := vm.Start(cfg, reg) + if startResult.IsError() { + logging.Error(startResult.Error().Error()) + os.Exit(1) + } } } } @@ -109,8 +104,11 @@ If no command is provided, starts an interactive zsh shell.`, } status := statusResult.MustGet() - remoteCmd := fmt.Sprintf("mkdir -p '%s' && (mountpoint -q '%s' || mount -t 9p ws_%s '%s' -o trans=virtio,version=9p2000.L,msize=104857600) && cd '%s' && %s", - ws.GuestPath, ws.GuestPath, ws.Hash, ws.GuestPath, ws.GuestPath, strings.Join(args, " ")) + // Use virtiofs mount instead of 9p + // virtiofs mounts use the tag name directly + remoteCmd := fmt.Sprintf( + "mkdir -p '%s' && (mountpoint -q '%s' || mount -t virtiofs %s '%s') && cd '%s' && %s", + ws.GuestPath, ws.GuestPath, ws.MountTag, ws.GuestPath, ws.GuestPath, strings.Join(args, " ")) sshArgs := []string{ "-p", "root", diff --git a/internal/lock/lock.go b/internal/lock/lock.go new file mode 100644 index 0000000..512e4eb --- /dev/null +++ b/internal/lock/lock.go @@ -0,0 +1,78 @@ +// Package lock provides file-based locking for VM operations. +package lock + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + "time" + + "github.com/samber/mo" + "qvm/internal/config" +) + +// Lock represents an exclusive file lock on VM operations. +type Lock struct { + file *os.File +} + +var lockPath = filepath.Join(config.StateDir, "vm.lock") + +// Acquire obtains an exclusive lock on VM operations. +// Blocks until the lock is available or timeout is reached. 
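+//
+// Typical usage (sketch, mirroring how internal/vm acquires the lock):
+//
+//	result := lock.Acquire(30 * time.Second)
+//	if result.IsError() {
+//		return result.Error()
+//	}
+//	vmLock := result.MustGet()
+//	defer vmLock.Release()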
+func Acquire(timeout time.Duration) mo.Result[*Lock] { + if err := os.MkdirAll(config.StateDir, 0755); err != nil { + return mo.Err[*Lock](fmt.Errorf("failed to create state directory: %w", err)) + } + + file, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0644) + if err != nil { + return mo.Err[*Lock](fmt.Errorf("failed to open lock file: %w", err)) + } + + deadline := time.Now().Add(timeout) + for { + err := syscall.Flock(int(file.Fd()), syscall.LOCK_EX|syscall.LOCK_NB) + if err == nil { + return mo.Ok(&Lock{file: file}) + } + + if time.Now().After(deadline) { + file.Close() + return mo.Err[*Lock](fmt.Errorf("timeout waiting for VM lock after %v", timeout)) + } + + time.Sleep(100 * time.Millisecond) + } +} + +// TryAcquire attempts to obtain an exclusive lock without blocking. +// Returns error if lock is held by another process. +func TryAcquire() mo.Result[*Lock] { + if err := os.MkdirAll(config.StateDir, 0755); err != nil { + return mo.Err[*Lock](fmt.Errorf("failed to create state directory: %w", err)) + } + + file, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0644) + if err != nil { + return mo.Err[*Lock](fmt.Errorf("failed to open lock file: %w", err)) + } + + err = syscall.Flock(int(file.Fd()), syscall.LOCK_EX|syscall.LOCK_NB) + if err != nil { + file.Close() + return mo.Err[*Lock](fmt.Errorf("VM operation in progress by another process")) + } + + return mo.Ok(&Lock{file: file}) +} + +// Release releases the lock. +func (l *Lock) Release() error { + if l.file == nil { + return nil + } + syscall.Flock(int(l.file.Fd()), syscall.LOCK_UN) + return l.file.Close() +} diff --git a/internal/qmp/client.go b/internal/qmp/client.go index 958a271..1e687e5 100644 --- a/internal/qmp/client.go +++ b/internal/qmp/client.go @@ -73,3 +73,88 @@ func (c *Client) Shutdown() mo.Result[struct{}] { func (c *Client) Close() error { return c.monitor.Disconnect() } + +// AddChardev adds a vhost-user-fs chardev for a virtiofsd socket. +// This is the first step of hot-mounting a filesystem. +func (c *Client) AddChardev(id, socketPath string) mo.Result[struct{}] { + cmd := fmt.Sprintf(`{"execute":"chardev-add","arguments":{"id":"%s","backend":{"type":"socket","data":{"addr":{"type":"unix","data":{"path":"%s"}},"server":false}}}}`, id, socketPath) + raw, err := c.monitor.Run([]byte(cmd)) + if err != nil { + return mo.Err[struct{}](fmt.Errorf("failed to add chardev %s: %w", id, err)) + } + + // Check for error in response + var resp map[string]interface{} + if err := json.Unmarshal(raw, &resp); err == nil { + if errObj, ok := resp["error"]; ok { + return mo.Err[struct{}](fmt.Errorf("QMP error adding chardev: %v", errObj)) + } + } + + return mo.Ok(struct{}{}) +} + +// AddVhostUserFsDevice adds a vhost-user-fs-pci device connected to a chardev. +// This is the second step of hot-mounting a filesystem. 
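+//
+// Note that device_add only exposes the filesystem to the guest; the guest
+// still has to mount it by tag, e.g. (guest-side sketch, placeholders shown
+// in angle brackets):
+//
+//	mount -t virtiofs <tag> <guest-mountpoint>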
+func (c *Client) AddVhostUserFsDevice(chardevID, tag string) mo.Result[struct{}] { + // Device ID must be unique, derive from chardev ID + deviceID := "dev_" + chardevID + cmd := fmt.Sprintf(`{"execute":"device_add","arguments":{"driver":"vhost-user-fs-pci","chardev":"%s","tag":"%s","id":"%s","queue-size":1024}}`, chardevID, tag, deviceID) + raw, err := c.monitor.Run([]byte(cmd)) + if err != nil { + return mo.Err[struct{}](fmt.Errorf("failed to add vhost-user-fs device for %s: %w", tag, err)) + } + + // Check for error in response + var resp map[string]interface{} + if err := json.Unmarshal(raw, &resp); err == nil { + if errObj, ok := resp["error"]; ok { + return mo.Err[struct{}](fmt.Errorf("QMP error adding device: %v", errObj)) + } + } + + return mo.Ok(struct{}{}) +} + +// HotMountFilesystem performs a complete hot-mount of a virtiofsd filesystem. +// Requires the virtiofsd daemon to already be running and listening on socketPath. +func (c *Client) HotMountFilesystem(tag, socketPath string) mo.Result[struct{}] { + // Use tag as chardev ID for simplicity + chardevID := tag + + // Step 1: Add chardev + if result := c.AddChardev(chardevID, socketPath); result.IsError() { + return result + } + + // Step 2: Add device + if result := c.AddVhostUserFsDevice(chardevID, tag); result.IsError() { + return result + } + + return mo.Ok(struct{}{}) +} + +// QueryChardevs lists all current chardevs to check if one exists. +func (c *Client) QueryChardevs() mo.Result[[]string] { + cmd := []byte(`{"execute":"query-chardev"}`) + raw, err := c.monitor.Run(cmd) + if err != nil { + return mo.Err[[]string](fmt.Errorf("failed to query chardevs: %w", err)) + } + + var resp struct { + Return []struct { + Label string `json:"label"` + } `json:"return"` + } + if err := json.Unmarshal(raw, &resp); err != nil { + return mo.Err[[]string](fmt.Errorf("failed to parse chardev response: %w", err)) + } + + var labels []string + for _, c := range resp.Return { + labels = append(labels, c.Label) + } + return mo.Ok(labels) +} diff --git a/internal/vm/lifecycle.go b/internal/vm/lifecycle.go index 2c4df4b..1fb6acd 100644 --- a/internal/vm/lifecycle.go +++ b/internal/vm/lifecycle.go @@ -5,7 +5,10 @@ import ( "os" "os/exec" "qvm/internal/config" + "qvm/internal/lock" "qvm/internal/logging" + "qvm/internal/qmp" + "qvm/internal/virtiofsd" "qvm/internal/workspace" "strconv" "strings" @@ -22,45 +25,48 @@ type VMStatus struct { SSHPort int } -// Mount represents a 9p filesystem mount -type Mount struct { - Tag string - HostPath string -} - -// Start launches the VM with all configured mounts. +// Start launches the VM with all configured mounts using virtiofsd. // Sequence: -// 1. Check if VM is already running (via PID file and process check) -// 2. Ensure all required directories exist -// 3. Build mount list (cache mounts + workspace mounts from registry) -// 4. Find available SSH port -// 5. Build and start VM via runner script with 9p virtfs mounts -// 6. Write PID and SSH port to state files -// 7. Wait for SSH to become available (60 second timeout) +// 1. Acquire exclusive lock (prevents race conditions from concurrent qvm run) +// 2. Check if VM is already running (via PID file and process check) +// 3. Clean up any stale state files from previous failed starts +// 4. Ensure all required directories exist +// 5. Start virtiofsd daemons for all mounts (cache + workspaces) +// 6. Find available SSH port +// 7. Build and start QEMU with vhost-user-fs-pci devices +// 8. Write PID and SSH port to state files +// 9. 
Wait for SSH to become available (120 second timeout) // // Returns error if any step fails. func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] { - // 1. Check if already running + // 1. Acquire exclusive lock + lockResult := lock.Acquire(30 * time.Second) + if lockResult.IsError() { + return mo.Err[struct{}](lockResult.Error()) + } + vmLock := lockResult.MustGet() + defer vmLock.Release() + + // 2. Check if already running if IsRunning() { return mo.Err[struct{}](fmt.Errorf("VM is already running")) } - // 2. Ensure directories exist + // 3. Clean up any stale state files + cleanupStateFiles() + + // 4. Ensure directories exist if err := config.EnsureDirs(); err != nil { return mo.Err[struct{}](fmt.Errorf("failed to create directories: %w", err)) } - // 2a. Check if base image exists + // 4a. Check if base image exists if _, err := os.Stat(config.BaseImage); os.IsNotExist(err) { return mo.Err[struct{}](fmt.Errorf("base image not found at %s - run 'qvm rebuild' first", config.BaseImage)) } - // 2b. Create overlay if it doesn't exist (backed by base image) + // 4b. Create overlay if it doesn't exist (backed by base image) if _, err := os.Stat(config.Overlay); os.IsNotExist(err) { - if _, err := os.Stat(config.BaseImage); os.IsNotExist(err) { - return mo.Err[struct{}](fmt.Errorf("base image not found at %s - run 'qvm rebuild' first", config.BaseImage)) - } - logging.Info("Creating overlay image backed by base image...") cmd := exec.Command("qemu-img", "create", "-f", "qcow2", "-F", "qcow2", "-b", config.BaseImage, config.Overlay) @@ -69,46 +75,49 @@ func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] { } } - // 3. Build mount list (for 9p virtfs) - mounts := []Mount{ - {Tag: "cargo_home", HostPath: config.CargoHome}, - {Tag: "cargo_target", HostPath: config.CargoTarget}, - {Tag: "pnpm_store", HostPath: config.PnpmStore}, - {Tag: "sccache", HostPath: config.Sccache}, - } + // 5. Build mount list and start virtiofsd daemons + mounts := virtiofsd.DefaultCacheMounts() // Add opencode config mount if directory exists if _, err := os.Stat(config.HostOpencodeConfig); err == nil { - mounts = append(mounts, Mount{ - Tag: "opencode_config", - HostPath: config.HostOpencodeConfig, + mounts = append(mounts, virtiofsd.Mount{ + Tag: "opencode_config", + HostPath: config.HostOpencodeConfig, + SocketPath: config.StateDir + "/opencode_config.sock", }) } // Add workspace mounts from registry for _, ws := range reg.List() { - mounts = append(mounts, Mount{ - Tag: ws.MountTag, - HostPath: ws.HostPath, - }) + mounts = append(mounts, virtiofsd.WorkspaceMount(ws.MountTag, ws.HostPath)) } - // 4. Find available SSH port + // Start virtiofsd manager + vfsManager := virtiofsd.NewManager(config.StateDir) + startResult := vfsManager.StartAll(mounts) + if startResult.IsError() { + return mo.Err[struct{}](fmt.Errorf("failed to start virtiofsd daemons: %w", startResult.Error())) + } + + // 6. Find available SSH port sshPort, err := findAvailablePort(2222) if err != nil { + vfsManager.StopAll() return mo.Err[struct{}](fmt.Errorf("failed to find available SSH port: %w", err)) } - // 5. Build QEMU command and start VM directly - args := buildQEMUArgs(cfg, sshPort, mounts) - cmd := exec.Command("qemu-system-x86_64", args...) + // 7. Build QEMU command and start VM + args := buildQEMUCommand(cfg, sshPort, mounts) + logging.Info(fmt.Sprintf("Starting QEMU with %d mounts...", len(mounts))) + cmd := exec.Command("qemu-system-x86_64", args...) 
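+	// buildQEMUCommand includes -daemonize and -pidfile, so Run() returns once
+	// QEMU has forked into the background; the loop below then waits for the
+	// PID file to appear before reading it.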
cmd.Stdout = nil cmd.Stderr = nil cmd.Stdin = nil if err := cmd.Run(); err != nil { - return mo.Err[struct{}](fmt.Errorf("failed to start QEMU: %w", err)) + vfsManager.StopAll() + return mo.Err[struct{}](fmt.Errorf("failed to start QEMU: %w\n\nTry running 'qvm doctor' to diagnose the issue.", err)) } logging.Info("Waiting for VM to daemonize...") @@ -122,11 +131,13 @@ func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] { } if !pidFileReady { - return mo.Err[struct{}](fmt.Errorf("QEMU did not create PID file after 5 seconds")) + vfsManager.StopAll() + return mo.Err[struct{}](fmt.Errorf("QEMU did not create PID file after 5 seconds\n\nTry running 'qvm doctor' to diagnose the issue.")) } pidBytes, err := os.ReadFile(config.PIDFile) if err != nil { + vfsManager.StopAll() return mo.Err[struct{}](fmt.Errorf("failed to read PID file: %w", err)) } pid := strings.TrimSpace(string(pidBytes)) @@ -134,43 +145,121 @@ func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] { logging.Info("VM started with PID " + pid) if err := os.WriteFile(config.SSHPortFile, []byte(strconv.Itoa(sshPort)), 0644); err != nil { - if pidBytes, err := os.ReadFile(config.PIDFile); err == nil { - if pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes))); err == nil { - if process, err := os.FindProcess(pid); err == nil { - _ = process.Kill() - } - } - } - _ = os.Remove(config.PIDFile) + stopQEMU() + vfsManager.StopAll() return mo.Err[struct{}](fmt.Errorf("failed to write SSH port file: %w", err)) } - // 7. Wait for SSH + // 9. Wait for SSH if err := waitForSSH(sshPort, 120*time.Second); err != nil { - _ = cmd.Process.Kill() - _ = os.Remove(config.PIDFile) - _ = os.Remove(config.SSHPortFile) - return mo.Err[struct{}](fmt.Errorf("VM started but SSH not available: %w", err)) + stopQEMU() + vfsManager.StopAll() + cleanupStateFiles() + return mo.Err[struct{}](fmt.Errorf("VM started but SSH not available: %w\n\nTry running 'qvm doctor' to diagnose the issue.", err)) } return mo.Ok(struct{}{}) } -// Stop gracefully shuts down the VM. +// HotMountWorkspace mounts a new workspace into a running VM without restart. +// This uses QMP to dynamically add a vhost-user-fs-pci device. 
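+// The sequence is: start a virtiofsd for the workspace, chardev-add its
+// socket over QMP, then device_add a vhost-user-fs-pci bound to that
+// chardev. Mounting inside the guest is left to the caller (see the
+// virtiofs mount command in cmd/qvm/run.go).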
+func HotMountWorkspace(ws *workspace.Workspace) mo.Result[struct{}] { + if !IsRunning() { + return mo.Err[struct{}](fmt.Errorf("VM is not running")) + } + + // Acquire lock to prevent concurrent hot-mounts + lockResult := lock.Acquire(10 * time.Second) + if lockResult.IsError() { + return mo.Err[struct{}](lockResult.Error()) + } + vmLock := lockResult.MustGet() + defer vmLock.Release() + + // Start virtiofsd for this workspace + mount := virtiofsd.WorkspaceMount(ws.MountTag, ws.HostPath) + vfsManager := virtiofsd.NewManager(config.StateDir) + + startResult := vfsManager.StartMount(mount) + if startResult.IsError() { + return mo.Err[struct{}](fmt.Errorf("failed to start virtiofsd: %w", startResult.Error())) + } + + // Connect to QMP and hot-add the device + qmpResult := qmp.Connect(config.QMPSocket) + if qmpResult.IsError() { + vfsManager.StopMount(mount) + return mo.Err[struct{}](fmt.Errorf("failed to connect to QMP: %w", qmpResult.Error())) + } + client := qmpResult.MustGet() + defer client.Close() + + // Hot-mount the filesystem + hotMountResult := client.HotMountFilesystem(ws.MountTag, mount.SocketPath) + if hotMountResult.IsError() { + vfsManager.StopMount(mount) + return mo.Err[struct{}](fmt.Errorf("failed to hot-mount filesystem: %w", hotMountResult.Error())) + } + + logging.Info(fmt.Sprintf("Hot-mounted workspace %s", ws.MountTag)) + return mo.Ok(struct{}{}) +} + +// IsWorkspaceMounted checks if a workspace is already mounted in the running VM. +func IsWorkspaceMounted(ws *workspace.Workspace) bool { + if !IsRunning() { + return false + } + + qmpResult := qmp.Connect(config.QMPSocket) + if qmpResult.IsError() { + return false + } + client := qmpResult.MustGet() + defer client.Close() + + chardevsResult := client.QueryChardevs() + if chardevsResult.IsError() { + return false + } + + for _, label := range chardevsResult.MustGet() { + if label == ws.MountTag { + return true + } + } + return false +} + +// Stop gracefully shuts down the VM and virtiofsd daemons. // Sequence: -// 1. Read PID from file -// 2. Send SIGTERM to the process -// 3. Wait up to 30 seconds for graceful shutdown (poll every second) -// 4. If still running, send SIGKILL -// 5. Clean up PID and port files +// 1. Acquire lock +// 2. Read PID from file +// 3. Send SIGTERM to the process +// 4. Wait up to 30 seconds for graceful shutdown +// 5. If still running, send SIGKILL +// 6. Stop all virtiofsd daemons +// 7. Clean up state files // // Returns success even if VM is not running (idempotent). func Stop() mo.Result[struct{}] { - // 1. Read PID file + // Acquire lock + lockResult := lock.Acquire(30 * time.Second) + if lockResult.IsError() { + return mo.Err[struct{}](lockResult.Error()) + } + vmLock := lockResult.MustGet() + defer vmLock.Release() + + // Stop virtiofsd daemons first + vfsManager := virtiofsd.NewManager(config.StateDir) + vfsManager.StopAll() + + // Read PID file pidBytes, err := os.ReadFile(config.PIDFile) if err != nil { if os.IsNotExist(err) { - // Not running + cleanupStateFiles() return mo.Ok(struct{}{}) } return mo.Err[struct{}](fmt.Errorf("failed to read PID file: %w", err)) @@ -178,48 +267,55 @@ func Stop() mo.Result[struct{}] { pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes))) if err != nil { + cleanupStateFiles() return mo.Err[struct{}](fmt.Errorf("invalid PID in file: %w", err)) } // Check if process exists process, err := os.FindProcess(pid) if err != nil { - // Process doesn't exist, clean up cleanupStateFiles() return mo.Ok(struct{}{}) } - // 2. 
Send SIGTERM for graceful shutdown + // Send SIGTERM for graceful shutdown if err := process.Signal(syscall.SIGTERM); err != nil { - // Process already gone cleanupStateFiles() return mo.Ok(struct{}{}) } - // 3. Wait up to 30 seconds for process to exit (poll every second) + // Wait up to 30 seconds for process to exit for i := 0; i < 30; i++ { time.Sleep(1 * time.Second) - - // Check if process still exists by sending signal 0 if err := process.Signal(syscall.Signal(0)); err != nil { - // Process no longer exists cleanupStateFiles() return mo.Ok(struct{}{}) } } - // 4. Timeout, force kill + // Timeout, force kill _ = process.Signal(syscall.SIGKILL) - - // Wait a moment for SIGKILL to take effect time.Sleep(1 * time.Second) - // 5. Clean up state files cleanupStateFiles() - return mo.Ok(struct{}{}) } +// stopQEMU stops just the QEMU process (used during failed starts) +func stopQEMU() { + pidBytes, err := os.ReadFile(config.PIDFile) + if err != nil { + return + } + pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes))) + if err != nil { + return + } + if process, err := os.FindProcess(pid); err == nil { + _ = process.Kill() + } +} + // cleanupStateFiles removes all VM state files func cleanupStateFiles() { _ = os.Remove(config.PIDFile) @@ -239,7 +335,6 @@ func Status() mo.Result[VMStatus] { return mo.Ok(status) } - // Read PID pidBytes, err := os.ReadFile(config.PIDFile) if err != nil { return mo.Err[VMStatus](fmt.Errorf("failed to read PID file: %w", err)) @@ -250,7 +345,6 @@ func Status() mo.Result[VMStatus] { return mo.Err[VMStatus](fmt.Errorf("invalid PID in file: %w", err)) } - // Read SSH port portBytes, err := os.ReadFile(config.SSHPortFile) if err != nil { return mo.Err[VMStatus](fmt.Errorf("failed to read SSH port file: %w", err)) @@ -269,15 +363,12 @@ func Status() mo.Result[VMStatus] { } // Reset stops the VM and deletes the overlay image. -// This returns the VM to a fresh state based on the base image. func Reset() mo.Result[struct{}] { - // Stop VM if running stopResult := Stop() if stopResult.IsError() { return mo.Err[struct{}](fmt.Errorf("failed to stop VM: %w", stopResult.Error())) } - // Delete overlay image if err := os.Remove(config.Overlay); err != nil && !os.IsNotExist(err) { return mo.Err[struct{}](fmt.Errorf("failed to delete overlay: %w", err)) } @@ -285,8 +376,7 @@ func Reset() mo.Result[struct{}] { return mo.Ok(struct{}{}) } -// IsRunning performs a quick check if the VM is running by checking -// the PID file and verifying the process exists. +// IsRunning performs a quick check if the VM is running. 
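+// It reads the PID file and probes the recorded process with signal 0.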
func IsRunning() bool { pidBytes, err := os.ReadFile(config.PIDFile) if err != nil { @@ -298,7 +388,6 @@ func IsRunning() bool { return false } - // Check if process exists by sending signal 0 process, err := os.FindProcess(pid) if err != nil { return false @@ -308,35 +397,6 @@ func IsRunning() bool { return err == nil } -func buildQEMUArgs(cfg *config.Config, sshPort int, mounts []Mount) []string { - // Boot directly from the qcow2 disk image (has GRUB installed) - // Do NOT use -kernel/-initrd - that's for NixOS VM runner which requires special 9p mounts - args := []string{ - "-machine", "q35", - "-accel", "kvm", - "-cpu", "host", - "-m", cfg.VM.Memory, - "-smp", strconv.Itoa(cfg.VM.CPUs), - "-display", "none", - "-daemonize", - "-pidfile", config.PIDFile, - "-drive", fmt.Sprintf("file=%s,if=virtio,format=qcow2", config.Overlay), - "-netdev", fmt.Sprintf("user,id=n0,hostfwd=tcp::%d-:22", sshPort), - "-device", "virtio-net-pci,netdev=n0", - "-serial", fmt.Sprintf("file:%s", config.SerialLog), - } - - // Add 9p mounts for cache directories and workspaces - for _, mount := range mounts { - args = append(args, - "-virtfs", fmt.Sprintf("local,path=%s,mount_tag=%s,security_model=mapped-xattr,id=%s", - mount.HostPath, mount.Tag, mount.Tag), - ) - } - - return args -} - // findAvailablePort finds an available TCP port starting from the given base port. func findAvailablePort(basePort int) (int, error) { const maxAttempts = 100 @@ -354,7 +414,6 @@ func findAvailablePort(basePort int) (int, error) { } // waitForSSH waits for SSH to become available on the given port. -// Uses sshpass with password 'root' to test connection. func waitForSSH(port int, timeout time.Duration) error { deadline := time.Now().Add(timeout) diff --git a/internal/vm/qemu.go b/internal/vm/qemu.go index deae525..220f52e 100644 --- a/internal/vm/qemu.go +++ b/internal/vm/qemu.go @@ -7,42 +7,30 @@ import ( "strconv" ) +// buildQEMUCommand builds the QEMU command line for virtiofsd-based mounts. +// Uses vhost-user-fs-pci devices which support hot-plugging. func buildQEMUCommand(cfg *config.Config, sshPort int, mounts []virtiofsd.Mount) []string { - args := []string{ - "qemu-system-x86_64", - "-enable-kvm", - } - memSize := cfg.VM.Memory - args = append(args, + + // vhost-user-fs requires shared memory backend + args := []string{ + "-machine", "q35", + "-accel", "kvm", + "-cpu", "host", "-object", fmt.Sprintf("memory-backend-memfd,id=mem,size=%s,share=on", memSize), "-numa", "node,memdev=mem", - ) - - args = append(args, "-smp", strconv.Itoa(cfg.VM.CPUs), - ) - - args = append(args, - "-drive", fmt.Sprintf("if=virtio,file=%s,format=qcow2", config.Overlay), - ) - - args = append(args, - "-nic", fmt.Sprintf("user,model=virtio-net-pci,hostfwd=tcp::%d-:22", sshPort), - ) - - args = append(args, - "-serial", fmt.Sprintf("file:%s", config.SerialLog), - ) - - args = append(args, - "-qmp", fmt.Sprintf("unix:%s,server,nowait", config.QMPSocket), - ) - - args = append(args, "-display", "none", - ) + "-daemonize", + "-pidfile", config.PIDFile, + "-drive", fmt.Sprintf("file=%s,if=virtio,format=qcow2", config.Overlay), + "-netdev", fmt.Sprintf("user,id=n0,hostfwd=tcp::%d-:22", sshPort), + "-device", "virtio-net-pci,netdev=n0", + "-serial", fmt.Sprintf("file:%s", config.SerialLog), + "-qmp", fmt.Sprintf("unix:%s,server,nowait", config.QMPSocket), + } + // Add vhost-user-fs devices for each mount for _, mount := range mounts { args = append(args, "-chardev", fmt.Sprintf("socket,id=%s,path=%s", mount.Tag, mount.SocketPath),