Add qvm doctor command to diagnose and fix common issues

Joshua Bell 2026-01-27 11:54:26 -06:00
parent 2aec01b3b2
commit eb469f1cd8
8 changed files with 660 additions and 170 deletions

1
.envrc Normal file
View file

@@ -0,0 +1 @@
use flake

280
cmd/qvm/doctor.go Normal file
View file

@@ -0,0 +1,280 @@
package main
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"qvm/internal/config"
"qvm/internal/vm"
"qvm/internal/workspace"
"strconv"
"strings"
"github.com/spf13/cobra"
)
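// Typical invocations (sketch):
//
//	qvm doctor        # report issues and suggested fixes
//	qvm doctor --fix  # additionally clean up stale state files while the VM is stopped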
var doctorCmd = &cobra.Command{
Use: "doctor",
Short: "Diagnose and fix common QVM issues",
Long: `Runs diagnostic checks on QVM configuration and state.
Checks for:
- KVM availability and permissions
- Required binaries (qemu-system-x86_64, virtiofsd, sshpass, nc)
- Base image presence
- Stale state files (orphaned PID files, sockets)
- VM process state consistency
If issues are found, provides remediation steps.`,
Run: func(cmd *cobra.Command, args []string) {
var issues []string
var fixes []string
fmt.Println("QVM Doctor - Diagnosing issues...")
fmt.Println()
// Check 1: KVM availability
fmt.Print("Checking KVM availability... ")
if _, err := os.Stat("/dev/kvm"); err != nil {
fmt.Println("FAIL")
issues = append(issues, "KVM not available (/dev/kvm not found)")
fixes = append(fixes, "Ensure KVM is enabled in BIOS and kvm module is loaded: modprobe kvm_intel (or kvm_amd)")
} else {
// Check KVM permissions
f, err := os.OpenFile("/dev/kvm", os.O_RDWR, 0)
if err != nil {
fmt.Println("FAIL")
issues = append(issues, fmt.Sprintf("Cannot access /dev/kvm: %v", err))
fixes = append(fixes, "Add your user to the kvm group: sudo usermod -aG kvm $USER (then logout/login)")
} else {
f.Close()
fmt.Println("OK")
}
}
// Check 2: Required binaries
requiredBinaries := []string{
"qemu-system-x86_64",
"virtiofsd",
"sshpass",
"nc",
"qemu-img",
}
for _, bin := range requiredBinaries {
fmt.Printf("Checking for %s... ", bin)
if _, err := exec.LookPath(bin); err != nil {
fmt.Println("MISSING")
issues = append(issues, fmt.Sprintf("Required binary not found: %s", bin))
fixes = append(fixes, fmt.Sprintf("Install %s via your package manager or nix", bin))
} else {
fmt.Println("OK")
}
}
// Check 3: Base image
fmt.Print("Checking base image... ")
if _, err := os.Stat(config.BaseImage); err != nil {
fmt.Println("MISSING")
issues = append(issues, "Base image not found at "+config.BaseImage)
fixes = append(fixes, "Run 'qvm rebuild' to build the base image")
} else {
fmt.Println("OK")
}
// Check 4: State file consistency
fmt.Print("Checking state files... ")
stateIssues := checkStateFiles()
if len(stateIssues) > 0 {
fmt.Println("ISSUES FOUND")
issues = append(issues, stateIssues...)
fixes = append(fixes, "Run 'qvm doctor --fix' to clean up stale state files")
} else {
fmt.Println("OK")
}
// Check 5: virtiofsd sockets
fmt.Print("Checking virtiofsd sockets... ")
socketIssues := checkVirtiofsdSockets()
if len(socketIssues) > 0 {
fmt.Println("ISSUES FOUND")
issues = append(issues, socketIssues...)
fixes = append(fixes, "Stale sockets will be cleaned up on next start")
} else {
fmt.Println("OK")
}
// Check 6: Workspace registry
fmt.Print("Checking workspace registry... ")
regResult := workspace.Load(config.WorkspacesFile)
if regResult.IsError() {
fmt.Println("ERROR")
issues = append(issues, fmt.Sprintf("Cannot load workspace registry: %v", regResult.Error()))
fixes = append(fixes, "The registry will be recreated on next 'qvm run'")
} else {
reg := regResult.MustGet()
workspaces := reg.List()
invalidCount := 0
for _, ws := range workspaces {
if _, err := os.Stat(ws.HostPath); err != nil {
invalidCount++
}
}
if invalidCount > 0 {
fmt.Printf("WARNING (%d workspaces with missing host paths)\n", invalidCount)
} else {
fmt.Printf("OK (%d workspaces registered)\n", len(workspaces))
}
}
fmt.Println()
// Summary
if len(issues) == 0 {
fmt.Println("All checks passed!")
} else {
fmt.Printf("Found %d issue(s):\n", len(issues))
fmt.Println()
for i, issue := range issues {
fmt.Printf(" %d. %s\n", i+1, issue)
if i < len(fixes) {
fmt.Printf(" Fix: %s\n", fixes[i])
}
}
fmt.Println()
fixFlag, _ := cmd.Flags().GetBool("fix")
if fixFlag {
fmt.Println("Applying automatic fixes...")
applyFixes()
} else {
fmt.Println("Run 'qvm doctor --fix' to apply automatic fixes where possible.")
}
}
},
}
func init() {
doctorCmd.Flags().Bool("fix", false, "Attempt to automatically fix issues")
}
// checkStateFiles verifies that PID files correspond to running processes
func checkStateFiles() []string {
var issues []string
// Check if PID file exists but process doesn't
pidBytes, err := os.ReadFile(config.PIDFile)
if err == nil {
pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
if err == nil {
if !vm.IsRunning() {
issues = append(issues, fmt.Sprintf("Stale PID file: %s (process %d not running)", config.PIDFile, pid))
}
} else {
issues = append(issues, fmt.Sprintf("Invalid PID file: %s", config.PIDFile))
}
}
// Check for orphaned QMP socket
if _, err := os.Stat(config.QMPSocket); err == nil && !vm.IsRunning() {
issues = append(issues, fmt.Sprintf("Orphaned QMP socket: %s", config.QMPSocket))
}
// Check for orphaned SSH port file
if _, err := os.Stat(config.SSHPortFile); err == nil && !vm.IsRunning() {
issues = append(issues, fmt.Sprintf("Orphaned SSH port file: %s", config.SSHPortFile))
}
return issues
}
// checkVirtiofsdSockets looks for orphaned virtiofsd sockets
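// Assumes each mount's virtiofsd daemon writes <tag>.sock and virtiofsd-<tag>.pid
// under config.StateDir; sockets that do not follow this layout are reported as orphaned.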
func checkVirtiofsdSockets() []string {
var issues []string
pattern := filepath.Join(config.StateDir, "*.sock")
sockets, err := filepath.Glob(pattern)
if err != nil {
return issues
}
for _, sock := range sockets {
// Skip QMP socket
if sock == config.QMPSocket {
continue
}
// Check if corresponding virtiofsd is running
baseName := strings.TrimSuffix(filepath.Base(sock), ".sock")
pidFile := filepath.Join(config.StateDir, fmt.Sprintf("virtiofsd-%s.pid", baseName))
if _, err := os.Stat(pidFile); os.IsNotExist(err) {
issues = append(issues, fmt.Sprintf("Orphaned virtiofsd socket: %s (no PID file)", sock))
continue
}
pidBytes, err := os.ReadFile(pidFile)
if err != nil {
continue
}
pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
if err != nil {
issues = append(issues, fmt.Sprintf("Invalid virtiofsd PID file: %s", pidFile))
continue
}
// Check if process is running
process, err := os.FindProcess(pid)
if err != nil {
issues = append(issues, fmt.Sprintf("Orphaned virtiofsd socket: %s (process lookup failed)", sock))
continue
}
// Signal 0 checks for process existence without actually delivering a signal
if err := process.Signal(syscall.Signal(0)); err != nil {
issues = append(issues, fmt.Sprintf("Orphaned virtiofsd socket: %s (process %d not running)", sock, pid))
}
}
return issues
}
// applyFixes attempts to clean up stale state files
func applyFixes() {
if !vm.IsRunning() {
// Clean up all state files
fmt.Println(" Cleaning up stale state files...")
if err := os.Remove(config.PIDFile); err == nil {
fmt.Printf(" Removed: %s\n", config.PIDFile)
}
if err := os.Remove(config.SSHPortFile); err == nil {
fmt.Printf(" Removed: %s\n", config.SSHPortFile)
}
if err := os.Remove(config.QMPSocket); err == nil {
fmt.Printf(" Removed: %s\n", config.QMPSocket)
}
// Clean up virtiofsd sockets and PID files
pattern := filepath.Join(config.StateDir, "*.sock")
sockets, _ := filepath.Glob(pattern)
for _, sock := range sockets {
if err := os.Remove(sock); err == nil {
fmt.Printf(" Removed: %s\n", sock)
}
}
pattern = filepath.Join(config.StateDir, "virtiofsd-*.pid")
pidFiles, _ := filepath.Glob(pattern)
for _, pidFile := range pidFiles {
if err := os.Remove(pidFile); err == nil {
fmt.Printf(" Removed: %s\n", pidFile)
}
}
fmt.Println(" Done!")
} else {
fmt.Println(" VM is running, cannot clean up state files. Stop VM first with 'qvm stop'.")
}
}

View file

@@ -28,6 +28,7 @@ func init() {
rootCmd.AddCommand(rebuildCmd)
rootCmd.AddCommand(resetCmd)
rootCmd.AddCommand(cleanCmd)
+rootCmd.AddCommand(doctorCmd)
}
func main() {

View file

@@ -23,7 +23,10 @@ var runCmd = &cobra.Command{
The current directory is automatically registered as a workspace
and mounted into the VM. The command runs in the mounted workspace.
-If no command is provided, starts an interactive zsh shell.`,
+If no command is provided, starts an interactive zsh shell.
+Unlike previous versions, new workspaces can be hot-mounted into
+a running VM without requiring a restart.`,
Args: cobra.ArbitraryArgs,
Run: func(cmd *cobra.Command, args []string) {
// Default to zsh if no command provided
@@ -72,32 +75,24 @@ If no command is provided, starts an interactive zsh shell.`,
os.Exit(1)
}
} else {
-statusResult := vm.Status()
-if statusResult.IsError() {
-logging.Error(statusResult.Error().Error())
-os.Exit(1)
-}
-status := statusResult.MustGet()
-checkCmd := exec.Command("sshpass", "-p", "root", "ssh",
-"-o", "StrictHostKeyChecking=no",
-"-o", "UserKnownHostsFile=/dev/null",
-"-o", "LogLevel=ERROR",
-"-p", strconv.Itoa(status.SSHPort),
-"root@localhost",
-fmt.Sprintf("test -d /sys/bus/virtio/drivers/9pnet_virtio/*/mount_tag && grep -q 'ws_%s' /sys/bus/virtio/drivers/9pnet_virtio/*/mount_tag 2>/dev/null", ws.Hash))
-if checkCmd.Run() != nil {
-logging.Info("Workspace not available in running VM, restarting VM...")
-stopResult := vm.Stop()
-if stopResult.IsError() {
-logging.Error(stopResult.Error().Error())
-os.Exit(1)
-}
-startResult := vm.Start(cfg, reg)
-if startResult.IsError() {
-logging.Error(startResult.Error().Error())
-os.Exit(1)
-}
+// Check if workspace is mounted, hot-mount if not
+if !vm.IsWorkspaceMounted(ws) {
+logging.Info(fmt.Sprintf("Hot-mounting workspace %s...", ws.Hash))
+hotMountResult := vm.HotMountWorkspace(ws)
+if hotMountResult.IsError() {
+logging.Error(fmt.Sprintf("Failed to hot-mount workspace: %v", hotMountResult.Error()))
+logging.Info("Falling back to VM restart...")
+stopResult := vm.Stop()
+if stopResult.IsError() {
+logging.Error(stopResult.Error().Error())
+os.Exit(1)
+}
+startResult := vm.Start(cfg, reg)
+if startResult.IsError() {
+logging.Error(startResult.Error().Error())
+os.Exit(1)
+}
+}
}
}
@@ -109,8 +104,11 @@ If no command is provided, starts an interactive zsh shell.`,
}
status := statusResult.MustGet()
-remoteCmd := fmt.Sprintf("mkdir -p '%s' && (mountpoint -q '%s' || mount -t 9p ws_%s '%s' -o trans=virtio,version=9p2000.L,msize=104857600) && cd '%s' && %s",
-ws.GuestPath, ws.GuestPath, ws.Hash, ws.GuestPath, ws.GuestPath, strings.Join(args, " "))
+// Use virtiofs mount instead of 9p
+// virtiofs mounts use the tag name directly
+remoteCmd := fmt.Sprintf(
+"mkdir -p '%s' && (mountpoint -q '%s' || mount -t virtiofs %s '%s') && cd '%s' && %s",
+ws.GuestPath, ws.GuestPath, ws.MountTag, ws.GuestPath, ws.GuestPath, strings.Join(args, " "))
sshArgs := []string{
"-p", "root",

78
internal/lock/lock.go Normal file
View file

@@ -0,0 +1,78 @@
// Package lock provides file-based locking for VM operations.
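//
// Typical usage (sketch, mirroring vm.Start):
//
//	lockResult := lock.Acquire(30 * time.Second)
//	if lockResult.IsError() {
//		return mo.Err[struct{}](lockResult.Error())
//	}
//	vmLock := lockResult.MustGet()
//	defer vmLock.Release()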
package lock
import (
"fmt"
"os"
"path/filepath"
"syscall"
"time"
"github.com/samber/mo"
"qvm/internal/config"
)
// Lock represents an exclusive file lock on VM operations.
type Lock struct {
file *os.File
}
var lockPath = filepath.Join(config.StateDir, "vm.lock")
// Acquire obtains an exclusive lock on VM operations.
// Blocks until the lock is available or timeout is reached.
func Acquire(timeout time.Duration) mo.Result[*Lock] {
if err := os.MkdirAll(config.StateDir, 0755); err != nil {
return mo.Err[*Lock](fmt.Errorf("failed to create state directory: %w", err))
}
file, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0644)
if err != nil {
return mo.Err[*Lock](fmt.Errorf("failed to open lock file: %w", err))
}
deadline := time.Now().Add(timeout)
for {
err := syscall.Flock(int(file.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
if err == nil {
return mo.Ok(&Lock{file: file})
}
if time.Now().After(deadline) {
file.Close()
return mo.Err[*Lock](fmt.Errorf("timeout waiting for VM lock after %v", timeout))
}
time.Sleep(100 * time.Millisecond)
}
}
// TryAcquire attempts to obtain an exclusive lock without blocking.
// Returns error if lock is held by another process.
func TryAcquire() mo.Result[*Lock] {
if err := os.MkdirAll(config.StateDir, 0755); err != nil {
return mo.Err[*Lock](fmt.Errorf("failed to create state directory: %w", err))
}
file, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0644)
if err != nil {
return mo.Err[*Lock](fmt.Errorf("failed to open lock file: %w", err))
}
err = syscall.Flock(int(file.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
if err != nil {
file.Close()
return mo.Err[*Lock](fmt.Errorf("VM operation in progress by another process"))
}
return mo.Ok(&Lock{file: file})
}
// Release releases the lock.
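// The kernel also drops flock locks automatically when the holding process exits,
// so a crashed qvm invocation cannot leave the lock held.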
func (l *Lock) Release() error {
if l.file == nil {
return nil
}
syscall.Flock(int(l.file.Fd()), syscall.LOCK_UN)
return l.file.Close()
}

View file

@@ -73,3 +73,88 @@ func (c *Client) Shutdown() mo.Result[struct{}] {
func (c *Client) Close() error {
return c.monitor.Disconnect()
}
// AddChardev adds a vhost-user-fs chardev for a virtiofsd socket.
// This is the first step of hot-mounting a filesystem.
func (c *Client) AddChardev(id, socketPath string) mo.Result[struct{}] {
cmd := fmt.Sprintf(`{"execute":"chardev-add","arguments":{"id":"%s","backend":{"type":"socket","data":{"addr":{"type":"unix","data":{"path":"%s"}},"server":false}}}}`, id, socketPath)
raw, err := c.monitor.Run([]byte(cmd))
if err != nil {
return mo.Err[struct{}](fmt.Errorf("failed to add chardev %s: %w", id, err))
}
// Check for error in response
var resp map[string]interface{}
if err := json.Unmarshal(raw, &resp); err == nil {
if errObj, ok := resp["error"]; ok {
return mo.Err[struct{}](fmt.Errorf("QMP error adding chardev: %v", errObj))
}
}
return mo.Ok(struct{}{})
}
// AddVhostUserFsDevice adds a vhost-user-fs-pci device connected to a chardev.
// This is the second step of hot-mounting a filesystem.
func (c *Client) AddVhostUserFsDevice(chardevID, tag string) mo.Result[struct{}] {
// Device ID must be unique, derive from chardev ID
deviceID := "dev_" + chardevID
cmd := fmt.Sprintf(`{"execute":"device_add","arguments":{"driver":"vhost-user-fs-pci","chardev":"%s","tag":"%s","id":"%s","queue-size":1024}}`, chardevID, tag, deviceID)
raw, err := c.monitor.Run([]byte(cmd))
if err != nil {
return mo.Err[struct{}](fmt.Errorf("failed to add vhost-user-fs device for %s: %w", tag, err))
}
// Check for error in response
var resp map[string]interface{}
if err := json.Unmarshal(raw, &resp); err == nil {
if errObj, ok := resp["error"]; ok {
return mo.Err[struct{}](fmt.Errorf("QMP error adding device: %v", errObj))
}
}
return mo.Ok(struct{}{})
}
// HotMountFilesystem performs a complete hot-mount of a virtiofsd filesystem.
// Requires the virtiofsd daemon to already be running and listening on socketPath.
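// Note: this only attaches the device on the host side; the guest still has to
// mount the tag (cmd/qvm/run.go does this with `mount -t virtiofs <tag> <dir>`).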
func (c *Client) HotMountFilesystem(tag, socketPath string) mo.Result[struct{}] {
// Use tag as chardev ID for simplicity
chardevID := tag
// Step 1: Add chardev
if result := c.AddChardev(chardevID, socketPath); result.IsError() {
return result
}
// Step 2: Add device
if result := c.AddVhostUserFsDevice(chardevID, tag); result.IsError() {
return result
}
return mo.Ok(struct{}{})
}
// QueryChardevs lists all current chardevs to check if one exists.
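// vm.IsWorkspaceMounted relies on this to detect whether a workspace tag is already attached.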
func (c *Client) QueryChardevs() mo.Result[[]string] {
cmd := []byte(`{"execute":"query-chardev"}`)
raw, err := c.monitor.Run(cmd)
if err != nil {
return mo.Err[[]string](fmt.Errorf("failed to query chardevs: %w", err))
}
var resp struct {
Return []struct {
Label string `json:"label"`
} `json:"return"`
}
if err := json.Unmarshal(raw, &resp); err != nil {
return mo.Err[[]string](fmt.Errorf("failed to parse chardev response: %w", err))
}
var labels []string
for _, cd := range resp.Return {
labels = append(labels, cd.Label)
}
return mo.Ok(labels)
}

View file

@@ -5,7 +5,10 @@ import (
"os"
"os/exec"
"qvm/internal/config"
+"qvm/internal/lock"
"qvm/internal/logging"
+"qvm/internal/qmp"
+"qvm/internal/virtiofsd"
"qvm/internal/workspace"
"strconv"
"strings"
@@ -22,45 +25,48 @@ type VMStatus struct {
SSHPort int
}
-// Mount represents a 9p filesystem mount
-type Mount struct {
-Tag string
-HostPath string
-}
-// Start launches the VM with all configured mounts.
+// Start launches the VM with all configured mounts using virtiofsd.
// Sequence:
-// 1. Check if VM is already running (via PID file and process check)
-// 2. Ensure all required directories exist
-// 3. Build mount list (cache mounts + workspace mounts from registry)
-// 4. Find available SSH port
-// 5. Build and start VM via runner script with 9p virtfs mounts
-// 6. Write PID and SSH port to state files
-// 7. Wait for SSH to become available (60 second timeout)
+// 1. Acquire exclusive lock (prevents race conditions from concurrent qvm run)
+// 2. Check if VM is already running (via PID file and process check)
+// 3. Clean up any stale state files from previous failed starts
+// 4. Ensure all required directories exist
+// 5. Start virtiofsd daemons for all mounts (cache + workspaces)
+// 6. Find available SSH port
+// 7. Build and start QEMU with vhost-user-fs-pci devices
+// 8. Write PID and SSH port to state files
+// 9. Wait for SSH to become available (120 second timeout)
//
// Returns error if any step fails.
func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] {
-// 1. Check if already running
+// 1. Acquire exclusive lock
+lockResult := lock.Acquire(30 * time.Second)
+if lockResult.IsError() {
+return mo.Err[struct{}](lockResult.Error())
+}
+vmLock := lockResult.MustGet()
+defer vmLock.Release()
+// 2. Check if already running
if IsRunning() {
return mo.Err[struct{}](fmt.Errorf("VM is already running"))
}
-// 2. Ensure directories exist
+// 3. Clean up any stale state files
+cleanupStateFiles()
+// 4. Ensure directories exist
if err := config.EnsureDirs(); err != nil {
return mo.Err[struct{}](fmt.Errorf("failed to create directories: %w", err))
}
-// 2a. Check if base image exists
+// 4a. Check if base image exists
if _, err := os.Stat(config.BaseImage); os.IsNotExist(err) {
return mo.Err[struct{}](fmt.Errorf("base image not found at %s - run 'qvm rebuild' first", config.BaseImage))
}
-// 2b. Create overlay if it doesn't exist (backed by base image)
+// 4b. Create overlay if it doesn't exist (backed by base image)
if _, err := os.Stat(config.Overlay); os.IsNotExist(err) {
-if _, err := os.Stat(config.BaseImage); os.IsNotExist(err) {
-return mo.Err[struct{}](fmt.Errorf("base image not found at %s - run 'qvm rebuild' first", config.BaseImage))
-}
logging.Info("Creating overlay image backed by base image...")
cmd := exec.Command("qemu-img", "create", "-f", "qcow2",
"-F", "qcow2", "-b", config.BaseImage, config.Overlay)
@@ -69,46 +75,49 @@ func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] {
}
}
-// 3. Build mount list (for 9p virtfs)
-mounts := []Mount{
-{Tag: "cargo_home", HostPath: config.CargoHome},
-{Tag: "cargo_target", HostPath: config.CargoTarget},
-{Tag: "pnpm_store", HostPath: config.PnpmStore},
-{Tag: "sccache", HostPath: config.Sccache},
-}
+// 5. Build mount list and start virtiofsd daemons
+mounts := virtiofsd.DefaultCacheMounts()
// Add opencode config mount if directory exists
if _, err := os.Stat(config.HostOpencodeConfig); err == nil {
-mounts = append(mounts, Mount{
+mounts = append(mounts, virtiofsd.Mount{
Tag: "opencode_config",
HostPath: config.HostOpencodeConfig,
+SocketPath: config.StateDir + "/opencode_config.sock",
})
}
// Add workspace mounts from registry
for _, ws := range reg.List() {
-mounts = append(mounts, Mount{
-Tag: ws.MountTag,
-HostPath: ws.HostPath,
-})
+mounts = append(mounts, virtiofsd.WorkspaceMount(ws.MountTag, ws.HostPath))
}
-// 4. Find available SSH port
+// Start virtiofsd manager
+vfsManager := virtiofsd.NewManager(config.StateDir)
+startResult := vfsManager.StartAll(mounts)
+if startResult.IsError() {
+return mo.Err[struct{}](fmt.Errorf("failed to start virtiofsd daemons: %w", startResult.Error()))
+}
+// 6. Find available SSH port
sshPort, err := findAvailablePort(2222)
if err != nil {
+vfsManager.StopAll()
return mo.Err[struct{}](fmt.Errorf("failed to find available SSH port: %w", err))
}
-// 5. Build QEMU command and start VM directly
-args := buildQEMUArgs(cfg, sshPort, mounts)
-cmd := exec.Command("qemu-system-x86_64", args...)
+// 7. Build QEMU command and start VM
+args := buildQEMUCommand(cfg, sshPort, mounts)
+logging.Info(fmt.Sprintf("Starting QEMU with %d mounts...", len(mounts)))
+cmd := exec.Command("qemu-system-x86_64", args...)
cmd.Stdout = nil
cmd.Stderr = nil
cmd.Stdin = nil
if err := cmd.Run(); err != nil {
-return mo.Err[struct{}](fmt.Errorf("failed to start QEMU: %w", err))
+vfsManager.StopAll()
+return mo.Err[struct{}](fmt.Errorf("failed to start QEMU: %w\n\nTry running 'qvm doctor' to diagnose the issue.", err))
}
logging.Info("Waiting for VM to daemonize...")
@@ -122,11 +131,13 @@ func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] {
}
if !pidFileReady {
-return mo.Err[struct{}](fmt.Errorf("QEMU did not create PID file after 5 seconds"))
+vfsManager.StopAll()
+return mo.Err[struct{}](fmt.Errorf("QEMU did not create PID file after 5 seconds\n\nTry running 'qvm doctor' to diagnose the issue."))
}
pidBytes, err := os.ReadFile(config.PIDFile)
if err != nil {
+vfsManager.StopAll()
return mo.Err[struct{}](fmt.Errorf("failed to read PID file: %w", err))
}
pid := strings.TrimSpace(string(pidBytes))
@@ -134,43 +145,121 @@ func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] {
logging.Info("VM started with PID " + pid)
if err := os.WriteFile(config.SSHPortFile, []byte(strconv.Itoa(sshPort)), 0644); err != nil {
-if pidBytes, err := os.ReadFile(config.PIDFile); err == nil {
-if pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes))); err == nil {
-if process, err := os.FindProcess(pid); err == nil {
-_ = process.Kill()
-}
-}
-}
-_ = os.Remove(config.PIDFile)
+stopQEMU()
+vfsManager.StopAll()
return mo.Err[struct{}](fmt.Errorf("failed to write SSH port file: %w", err))
}
-// 7. Wait for SSH
+// 9. Wait for SSH
if err := waitForSSH(sshPort, 120*time.Second); err != nil {
-_ = cmd.Process.Kill()
-_ = os.Remove(config.PIDFile)
-_ = os.Remove(config.SSHPortFile)
-return mo.Err[struct{}](fmt.Errorf("VM started but SSH not available: %w", err))
+stopQEMU()
+vfsManager.StopAll()
+cleanupStateFiles()
+return mo.Err[struct{}](fmt.Errorf("VM started but SSH not available: %w\n\nTry running 'qvm doctor' to diagnose the issue.", err))
}
return mo.Ok(struct{}{})
}
-// Stop gracefully shuts down the VM.
// HotMountWorkspace mounts a new workspace into a running VM without restart.
// This uses QMP to dynamically add a vhost-user-fs-pci device.
func HotMountWorkspace(ws *workspace.Workspace) mo.Result[struct{}] {
if !IsRunning() {
return mo.Err[struct{}](fmt.Errorf("VM is not running"))
}
// Acquire lock to prevent concurrent hot-mounts
lockResult := lock.Acquire(10 * time.Second)
if lockResult.IsError() {
return mo.Err[struct{}](lockResult.Error())
}
vmLock := lockResult.MustGet()
defer vmLock.Release()
// Start virtiofsd for this workspace
mount := virtiofsd.WorkspaceMount(ws.MountTag, ws.HostPath)
vfsManager := virtiofsd.NewManager(config.StateDir)
startResult := vfsManager.StartMount(mount)
if startResult.IsError() {
return mo.Err[struct{}](fmt.Errorf("failed to start virtiofsd: %w", startResult.Error()))
}
// Connect to QMP and hot-add the device
qmpResult := qmp.Connect(config.QMPSocket)
if qmpResult.IsError() {
vfsManager.StopMount(mount)
return mo.Err[struct{}](fmt.Errorf("failed to connect to QMP: %w", qmpResult.Error()))
}
client := qmpResult.MustGet()
defer client.Close()
// Hot-mount the filesystem
hotMountResult := client.HotMountFilesystem(ws.MountTag, mount.SocketPath)
if hotMountResult.IsError() {
vfsManager.StopMount(mount)
return mo.Err[struct{}](fmt.Errorf("failed to hot-mount filesystem: %w", hotMountResult.Error()))
}
logging.Info(fmt.Sprintf("Hot-mounted workspace %s", ws.MountTag))
return mo.Ok(struct{}{})
}
// IsWorkspaceMounted checks if a workspace is already mounted in the running VM.
func IsWorkspaceMounted(ws *workspace.Workspace) bool {
if !IsRunning() {
return false
}
qmpResult := qmp.Connect(config.QMPSocket)
if qmpResult.IsError() {
return false
}
client := qmpResult.MustGet()
defer client.Close()
chardevsResult := client.QueryChardevs()
if chardevsResult.IsError() {
return false
}
for _, label := range chardevsResult.MustGet() {
if label == ws.MountTag {
return true
}
}
return false
}
+// Stop gracefully shuts down the VM and virtiofsd daemons.
// Sequence:
-// 1. Read PID from file
-// 2. Send SIGTERM to the process
-// 3. Wait up to 30 seconds for graceful shutdown (poll every second)
-// 4. If still running, send SIGKILL
-// 5. Clean up PID and port files
+// 1. Acquire lock
+// 2. Read PID from file
+// 3. Send SIGTERM to the process
+// 4. Wait up to 30 seconds for graceful shutdown
+// 5. If still running, send SIGKILL
+// 6. Stop all virtiofsd daemons
+// 7. Clean up state files
//
// Returns success even if VM is not running (idempotent).
func Stop() mo.Result[struct{}] {
-// 1. Read PID file
+// Acquire lock
+lockResult := lock.Acquire(30 * time.Second)
+if lockResult.IsError() {
+return mo.Err[struct{}](lockResult.Error())
+}
+vmLock := lockResult.MustGet()
+defer vmLock.Release()
+// Stop virtiofsd daemons first
+vfsManager := virtiofsd.NewManager(config.StateDir)
+vfsManager.StopAll()
+// Read PID file
pidBytes, err := os.ReadFile(config.PIDFile)
if err != nil {
if os.IsNotExist(err) {
-// Not running
+cleanupStateFiles()
return mo.Ok(struct{}{})
}
return mo.Err[struct{}](fmt.Errorf("failed to read PID file: %w", err))
@@ -178,48 +267,55 @@ func Stop() mo.Result[struct{}] {
pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
if err != nil {
+cleanupStateFiles()
return mo.Err[struct{}](fmt.Errorf("invalid PID in file: %w", err))
}
// Check if process exists
process, err := os.FindProcess(pid)
if err != nil {
-// Process doesn't exist, clean up
cleanupStateFiles()
return mo.Ok(struct{}{})
}
-// 2. Send SIGTERM for graceful shutdown
+// Send SIGTERM for graceful shutdown
if err := process.Signal(syscall.SIGTERM); err != nil {
-// Process already gone
cleanupStateFiles()
return mo.Ok(struct{}{})
}
-// 3. Wait up to 30 seconds for process to exit (poll every second)
+// Wait up to 30 seconds for process to exit
for i := 0; i < 30; i++ {
time.Sleep(1 * time.Second)
-// Check if process still exists by sending signal 0
if err := process.Signal(syscall.Signal(0)); err != nil {
-// Process no longer exists
cleanupStateFiles()
return mo.Ok(struct{}{})
}
}
-// 4. Timeout, force kill
+// Timeout, force kill
_ = process.Signal(syscall.SIGKILL)
-// Wait a moment for SIGKILL to take effect
time.Sleep(1 * time.Second)
-// 5. Clean up state files
cleanupStateFiles()
return mo.Ok(struct{}{})
}
// stopQEMU stops just the QEMU process (used during failed starts)
func stopQEMU() {
pidBytes, err := os.ReadFile(config.PIDFile)
if err != nil {
return
}
pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
if err != nil {
return
}
if process, err := os.FindProcess(pid); err == nil {
_ = process.Kill()
}
}
// cleanupStateFiles removes all VM state files
func cleanupStateFiles() {
_ = os.Remove(config.PIDFile)
@@ -239,7 +335,6 @@ func Status() mo.Result[VMStatus] {
return mo.Ok(status)
}
-// Read PID
pidBytes, err := os.ReadFile(config.PIDFile)
if err != nil {
return mo.Err[VMStatus](fmt.Errorf("failed to read PID file: %w", err))
@@ -250,7 +345,6 @@ func Status() mo.Result[VMStatus] {
return mo.Err[VMStatus](fmt.Errorf("invalid PID in file: %w", err))
}
-// Read SSH port
portBytes, err := os.ReadFile(config.SSHPortFile)
if err != nil {
return mo.Err[VMStatus](fmt.Errorf("failed to read SSH port file: %w", err))
@@ -269,15 +363,12 @@ func Status() mo.Result[VMStatus] {
}
// Reset stops the VM and deletes the overlay image.
-// This returns the VM to a fresh state based on the base image.
func Reset() mo.Result[struct{}] {
-// Stop VM if running
stopResult := Stop()
if stopResult.IsError() {
return mo.Err[struct{}](fmt.Errorf("failed to stop VM: %w", stopResult.Error()))
}
-// Delete overlay image
if err := os.Remove(config.Overlay); err != nil && !os.IsNotExist(err) {
return mo.Err[struct{}](fmt.Errorf("failed to delete overlay: %w", err))
}
@@ -285,8 +376,7 @@ func Reset() mo.Result[struct{}] {
return mo.Ok(struct{}{})
}
-// IsRunning performs a quick check if the VM is running by checking
-// the PID file and verifying the process exists.
+// IsRunning performs a quick check if the VM is running.
func IsRunning() bool {
pidBytes, err := os.ReadFile(config.PIDFile)
if err != nil {
@@ -298,7 +388,6 @@ func IsRunning() bool {
return false
}
-// Check if process exists by sending signal 0
process, err := os.FindProcess(pid)
if err != nil {
return false
@@ -308,35 +397,6 @@ func IsRunning() bool {
return err == nil
}
-func buildQEMUArgs(cfg *config.Config, sshPort int, mounts []Mount) []string {
-// Boot directly from the qcow2 disk image (has GRUB installed)
-// Do NOT use -kernel/-initrd - that's for NixOS VM runner which requires special 9p mounts
-args := []string{
-"-machine", "q35",
-"-accel", "kvm",
-"-cpu", "host",
-"-m", cfg.VM.Memory,
-"-smp", strconv.Itoa(cfg.VM.CPUs),
-"-display", "none",
-"-daemonize",
-"-pidfile", config.PIDFile,
-"-drive", fmt.Sprintf("file=%s,if=virtio,format=qcow2", config.Overlay),
-"-netdev", fmt.Sprintf("user,id=n0,hostfwd=tcp::%d-:22", sshPort),
-"-device", "virtio-net-pci,netdev=n0",
-"-serial", fmt.Sprintf("file:%s", config.SerialLog),
-}
-// Add 9p mounts for cache directories and workspaces
-for _, mount := range mounts {
-args = append(args,
-"-virtfs", fmt.Sprintf("local,path=%s,mount_tag=%s,security_model=mapped-xattr,id=%s",
-mount.HostPath, mount.Tag, mount.Tag),
-)
-}
-return args
-}
// findAvailablePort finds an available TCP port starting from the given base port.
func findAvailablePort(basePort int) (int, error) {
const maxAttempts = 100
@@ -354,7 +414,6 @@ func findAvailablePort(basePort int) (int, error) {
}
// waitForSSH waits for SSH to become available on the given port.
-// Uses sshpass with password 'root' to test connection.
func waitForSSH(port int, timeout time.Duration) error {
deadline := time.Now().Add(timeout)

View file

@@ -7,42 +7,30 @@ import (
"strconv"
)
+// buildQEMUCommand builds the QEMU command line for virtiofsd-based mounts.
+// Uses vhost-user-fs-pci devices which support hot-plugging.
func buildQEMUCommand(cfg *config.Config, sshPort int, mounts []virtiofsd.Mount) []string {
-args := []string{
-"qemu-system-x86_64",
-"-enable-kvm",
-}
memSize := cfg.VM.Memory
-args = append(args,
+// vhost-user-fs requires shared memory backend
+args := []string{
+"-machine", "q35",
+"-accel", "kvm",
+"-cpu", "host",
"-object", fmt.Sprintf("memory-backend-memfd,id=mem,size=%s,share=on", memSize),
"-numa", "node,memdev=mem",
-)
-args = append(args,
"-smp", strconv.Itoa(cfg.VM.CPUs),
-)
-args = append(args,
-"-drive", fmt.Sprintf("if=virtio,file=%s,format=qcow2", config.Overlay),
-)
-args = append(args,
-"-nic", fmt.Sprintf("user,model=virtio-net-pci,hostfwd=tcp::%d-:22", sshPort),
-)
-args = append(args,
-"-serial", fmt.Sprintf("file:%s", config.SerialLog),
-)
-args = append(args,
-"-qmp", fmt.Sprintf("unix:%s,server,nowait", config.QMPSocket),
-)
-args = append(args,
"-display", "none",
-)
+"-daemonize",
+"-pidfile", config.PIDFile,
+"-drive", fmt.Sprintf("file=%s,if=virtio,format=qcow2", config.Overlay),
+"-netdev", fmt.Sprintf("user,id=n0,hostfwd=tcp::%d-:22", sshPort),
+"-device", "virtio-net-pci,netdev=n0",
+"-serial", fmt.Sprintf("file:%s", config.SerialLog),
+"-qmp", fmt.Sprintf("unix:%s,server,nowait", config.QMPSocket),
+}
+// Add vhost-user-fs devices for each mount
for _, mount := range mounts {
args = append(args,
"-chardev", fmt.Sprintf("socket,id=%s,path=%s", mount.Tag, mount.SocketPath),