Add qvm doctor command to diagnose and fix common issues

Joshua Bell 2026-01-27 11:54:26 -06:00
parent 2aec01b3b2
commit eb469f1cd8
8 changed files with 660 additions and 170 deletions

internal/lock/lock.go (new file, 78 lines)

@@ -0,0 +1,78 @@
// Package lock provides file-based locking for VM operations.
package lock
import (
"fmt"
"os"
"path/filepath"
"syscall"
"time"
"github.com/samber/mo"
"qvm/internal/config"
)
// Lock represents an exclusive file lock on VM operations.
type Lock struct {
file *os.File
}
var lockPath = filepath.Join(config.StateDir, "vm.lock")
// Acquire obtains an exclusive lock on VM operations.
// Blocks until the lock is available or timeout is reached.
func Acquire(timeout time.Duration) mo.Result[*Lock] {
if err := os.MkdirAll(config.StateDir, 0755); err != nil {
return mo.Err[*Lock](fmt.Errorf("failed to create state directory: %w", err))
}
file, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0644)
if err != nil {
return mo.Err[*Lock](fmt.Errorf("failed to open lock file: %w", err))
}
deadline := time.Now().Add(timeout)
for {
err := syscall.Flock(int(file.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
if err == nil {
return mo.Ok(&Lock{file: file})
}
if time.Now().After(deadline) {
file.Close()
return mo.Err[*Lock](fmt.Errorf("timeout waiting for VM lock after %v", timeout))
}
time.Sleep(100 * time.Millisecond)
}
}
// TryAcquire attempts to obtain an exclusive lock without blocking.
// Returns an error if the lock is held by another process.
func TryAcquire() mo.Result[*Lock] {
if err := os.MkdirAll(config.StateDir, 0755); err != nil {
return mo.Err[*Lock](fmt.Errorf("failed to create state directory: %w", err))
}
file, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0644)
if err != nil {
return mo.Err[*Lock](fmt.Errorf("failed to open lock file: %w", err))
}
err = syscall.Flock(int(file.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
if err != nil {
file.Close()
return mo.Err[*Lock](fmt.Errorf("VM operation in progress by another process"))
}
return mo.Ok(&Lock{file: file})
}
// Release releases the lock.
func (l *Lock) Release() error {
if l.file == nil {
return nil
}
syscall.Flock(int(l.file.Fd()), syscall.LOCK_UN)
return l.file.Close()
}
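A minimal usage sketch for the new package (illustrative, not part of this diff; assumes imports of "time" and "qvm/internal/lock" and the samber/mo accessors used above):

func withVMLock(do func() error) error {
    res := lock.Acquire(30 * time.Second)
    if res.IsError() {
        return res.Error() // another qvm process held the lock past the timeout
    }
    vmLock := res.MustGet()
    defer vmLock.Release() // drops the flock and closes the lock file
    return do()
}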


@@ -73,3 +73,88 @@ func (c *Client) Shutdown() mo.Result[struct{}] {
func (c *Client) Close() error {
return c.monitor.Disconnect()
}
// AddChardev adds a vhost-user-fs chardev for a virtiofsd socket.
// This is the first step of hot-mounting a filesystem.
func (c *Client) AddChardev(id, socketPath string) mo.Result[struct{}] {
cmd := fmt.Sprintf(`{"execute":"chardev-add","arguments":{"id":"%s","backend":{"type":"socket","data":{"addr":{"type":"unix","data":{"path":"%s"}},"server":false}}}}`, id, socketPath)
raw, err := c.monitor.Run([]byte(cmd))
if err != nil {
return mo.Err[struct{}](fmt.Errorf("failed to add chardev %s: %w", id, err))
}
// Check for error in response
var resp map[string]interface{}
if err := json.Unmarshal(raw, &resp); err == nil {
if errObj, ok := resp["error"]; ok {
return mo.Err[struct{}](fmt.Errorf("QMP error adding chardev: %v", errObj))
}
}
return mo.Ok(struct{}{})
}
// AddVhostUserFsDevice adds a vhost-user-fs-pci device connected to a chardev.
// This is the second step of hot-mounting a filesystem.
func (c *Client) AddVhostUserFsDevice(chardevID, tag string) mo.Result[struct{}] {
// The device ID must be unique; derive it from the chardev ID
deviceID := "dev_" + chardevID
cmd := fmt.Sprintf(`{"execute":"device_add","arguments":{"driver":"vhost-user-fs-pci","chardev":"%s","tag":"%s","id":"%s","queue-size":1024}}`, chardevID, tag, deviceID)
raw, err := c.monitor.Run([]byte(cmd))
if err != nil {
return mo.Err[struct{}](fmt.Errorf("failed to add vhost-user-fs device for %s: %w", tag, err))
}
// Check for error in response
var resp map[string]interface{}
if err := json.Unmarshal(raw, &resp); err == nil {
if errObj, ok := resp["error"]; ok {
return mo.Err[struct{}](fmt.Errorf("QMP error adding device: %v", errObj))
}
}
return mo.Ok(struct{}{})
}
// HotMountFilesystem performs a complete hot-mount of a virtiofsd filesystem.
// Requires the virtiofsd daemon to already be running and listening on socketPath.
func (c *Client) HotMountFilesystem(tag, socketPath string) mo.Result[struct{}] {
// Use tag as chardev ID for simplicity
chardevID := tag
// Step 1: Add chardev
if result := c.AddChardev(chardevID, socketPath); result.IsError() {
return result
}
// Step 2: Add device
if result := c.AddVhostUserFsDevice(chardevID, tag); result.IsError() {
return result
}
return mo.Ok(struct{}{})
}
// QueryChardevs returns the labels of all current chardevs so callers can check whether a given one exists.
func (c *Client) QueryChardevs() mo.Result[[]string] {
cmd := []byte(`{"execute":"query-chardev"}`)
raw, err := c.monitor.Run(cmd)
if err != nil {
return mo.Err[[]string](fmt.Errorf("failed to query chardevs: %w", err))
}
var resp struct {
Return []struct {
Label string `json:"label"`
} `json:"return"`
}
if err := json.Unmarshal(raw, &resp); err != nil {
return mo.Err[[]string](fmt.Errorf("failed to parse chardev response: %w", err))
}
var labels []string
for _, c := range resp.Return {
labels = append(labels, c.Label)
}
return mo.Ok(labels)
}
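An end-to-end sketch of how these helpers chain together against an already-running virtiofsd socket (illustrative only; the tag, socket path, and guest mount point are invented, and imports of "qvm/internal/config" and "qvm/internal/qmp" are assumed):

func hotMountExample() error {
    res := qmp.Connect(config.QMPSocket)
    if res.IsError() {
        return res.Error()
    }
    client := res.MustGet()
    defer client.Close()
    // chardev-add followed by device_add; virtiofsd must already be
    // listening on the socket before QEMU connects to it.
    if r := client.HotMountFilesystem("ws_demo", "/run/qvm/ws_demo.sock"); r.IsError() {
        return r.Error()
    }
    // The guest still has to mount the new tag, e.g.:
    //   mount -t virtiofs ws_demo /workspaces/demo
    return nil
}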


@@ -5,7 +5,10 @@ import (
"os"
"os/exec"
"qvm/internal/config"
"qvm/internal/lock"
"qvm/internal/logging"
"qvm/internal/qmp"
"qvm/internal/virtiofsd"
"qvm/internal/workspace"
"strconv"
"strings"
@@ -22,45 +25,48 @@ type VMStatus struct {
SSHPort int
}
// Mount represents a 9p filesystem mount
type Mount struct {
Tag string
HostPath string
}
// Start launches the VM with all configured mounts.
// Start launches the VM with all configured mounts using virtiofsd.
// Sequence:
// 1. Check if VM is already running (via PID file and process check)
// 2. Ensure all required directories exist
// 3. Build mount list (cache mounts + workspace mounts from registry)
// 4. Find available SSH port
// 5. Build and start VM via runner script with 9p virtfs mounts
// 6. Write PID and SSH port to state files
// 7. Wait for SSH to become available (60 second timeout)
// 1. Acquire exclusive lock (prevents race conditions from concurrent qvm run)
// 2. Check if VM is already running (via PID file and process check)
// 3. Clean up any stale state files from previous failed starts
// 4. Ensure all required directories exist
// 5. Start virtiofsd daemons for all mounts (cache + workspaces)
// 6. Find available SSH port
// 7. Build and start QEMU with vhost-user-fs-pci devices
// 8. Write PID and SSH port to state files
// 9. Wait for SSH to become available (120 second timeout)
//
// Returns error if any step fails.
func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] {
// 1. Check if already running
// 1. Acquire exclusive lock
lockResult := lock.Acquire(30 * time.Second)
if lockResult.IsError() {
return mo.Err[struct{}](lockResult.Error())
}
vmLock := lockResult.MustGet()
defer vmLock.Release()
// 2. Check if already running
if IsRunning() {
return mo.Err[struct{}](fmt.Errorf("VM is already running"))
}
// 2. Ensure directories exist
// 3. Clean up any stale state files
cleanupStateFiles()
// 4. Ensure directories exist
if err := config.EnsureDirs(); err != nil {
return mo.Err[struct{}](fmt.Errorf("failed to create directories: %w", err))
}
// 2a. Check if base image exists
// 4a. Check if base image exists
if _, err := os.Stat(config.BaseImage); os.IsNotExist(err) {
return mo.Err[struct{}](fmt.Errorf("base image not found at %s - run 'qvm rebuild' first", config.BaseImage))
}
// 2b. Create overlay if it doesn't exist (backed by base image)
// 4b. Create overlay if it doesn't exist (backed by base image)
if _, err := os.Stat(config.Overlay); os.IsNotExist(err) {
if _, err := os.Stat(config.BaseImage); os.IsNotExist(err) {
return mo.Err[struct{}](fmt.Errorf("base image not found at %s - run 'qvm rebuild' first", config.BaseImage))
}
logging.Info("Creating overlay image backed by base image...")
cmd := exec.Command("qemu-img", "create", "-f", "qcow2",
"-F", "qcow2", "-b", config.BaseImage, config.Overlay)
@@ -69,46 +75,49 @@ func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] {
}
}
// 3. Build mount list (for 9p virtfs)
mounts := []Mount{
{Tag: "cargo_home", HostPath: config.CargoHome},
{Tag: "cargo_target", HostPath: config.CargoTarget},
{Tag: "pnpm_store", HostPath: config.PnpmStore},
{Tag: "sccache", HostPath: config.Sccache},
}
// 5. Build mount list and start virtiofsd daemons
mounts := virtiofsd.DefaultCacheMounts()
// Add opencode config mount if directory exists
if _, err := os.Stat(config.HostOpencodeConfig); err == nil {
mounts = append(mounts, Mount{
Tag: "opencode_config",
HostPath: config.HostOpencodeConfig,
mounts = append(mounts, virtiofsd.Mount{
Tag: "opencode_config",
HostPath: config.HostOpencodeConfig,
SocketPath: config.StateDir + "/opencode_config.sock",
})
}
// Add workspace mounts from registry
for _, ws := range reg.List() {
mounts = append(mounts, Mount{
Tag: ws.MountTag,
HostPath: ws.HostPath,
})
mounts = append(mounts, virtiofsd.WorkspaceMount(ws.MountTag, ws.HostPath))
}
// 4. Find available SSH port
// Start virtiofsd manager
vfsManager := virtiofsd.NewManager(config.StateDir)
startResult := vfsManager.StartAll(mounts)
if startResult.IsError() {
return mo.Err[struct{}](fmt.Errorf("failed to start virtiofsd daemons: %w", startResult.Error()))
}
// 6. Find available SSH port
sshPort, err := findAvailablePort(2222)
if err != nil {
vfsManager.StopAll()
return mo.Err[struct{}](fmt.Errorf("failed to find available SSH port: %w", err))
}
// 5. Build QEMU command and start VM directly
args := buildQEMUArgs(cfg, sshPort, mounts)
cmd := exec.Command("qemu-system-x86_64", args...)
// 7. Build QEMU command and start VM
args := buildQEMUCommand(cfg, sshPort, mounts)
logging.Info(fmt.Sprintf("Starting QEMU with %d mounts...", len(mounts)))
cmd := exec.Command("qemu-system-x86_64", args...)
cmd.Stdout = nil
cmd.Stderr = nil
cmd.Stdin = nil
if err := cmd.Run(); err != nil {
return mo.Err[struct{}](fmt.Errorf("failed to start QEMU: %w", err))
vfsManager.StopAll()
return mo.Err[struct{}](fmt.Errorf("failed to start QEMU: %w\n\nTry running 'qvm doctor' to diagnose the issue.", err))
}
logging.Info("Waiting for VM to daemonize...")
@@ -122,11 +131,13 @@ func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] {
}
if !pidFileReady {
return mo.Err[struct{}](fmt.Errorf("QEMU did not create PID file after 5 seconds"))
vfsManager.StopAll()
return mo.Err[struct{}](fmt.Errorf("QEMU did not create PID file after 5 seconds\n\nTry running 'qvm doctor' to diagnose the issue."))
}
pidBytes, err := os.ReadFile(config.PIDFile)
if err != nil {
vfsManager.StopAll()
return mo.Err[struct{}](fmt.Errorf("failed to read PID file: %w", err))
}
pid := strings.TrimSpace(string(pidBytes))
@@ -134,43 +145,121 @@ func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] {
logging.Info("VM started with PID " + pid)
if err := os.WriteFile(config.SSHPortFile, []byte(strconv.Itoa(sshPort)), 0644); err != nil {
if pidBytes, err := os.ReadFile(config.PIDFile); err == nil {
if pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes))); err == nil {
if process, err := os.FindProcess(pid); err == nil {
_ = process.Kill()
}
}
}
_ = os.Remove(config.PIDFile)
stopQEMU()
vfsManager.StopAll()
return mo.Err[struct{}](fmt.Errorf("failed to write SSH port file: %w", err))
}
// 7. Wait for SSH
// 9. Wait for SSH
if err := waitForSSH(sshPort, 120*time.Second); err != nil {
_ = cmd.Process.Kill()
_ = os.Remove(config.PIDFile)
_ = os.Remove(config.SSHPortFile)
return mo.Err[struct{}](fmt.Errorf("VM started but SSH not available: %w", err))
stopQEMU()
vfsManager.StopAll()
cleanupStateFiles()
return mo.Err[struct{}](fmt.Errorf("VM started but SSH not available: %w\n\nTry running 'qvm doctor' to diagnose the issue.", err))
}
return mo.Ok(struct{}{})
}
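The virtiofsd.Manager used in step 5 is not shown in this section; per mount it is assumed to spawn something roughly equivalent to the sketch below (flag names are from the Rust virtiofsd CLI, not copied from internal/virtiofsd):

func startVirtiofsdSketch(m virtiofsd.Mount) error {
    cmd := exec.Command("virtiofsd",
        "--socket-path", m.SocketPath, // QEMU's vhost-user chardev connects here
        "--shared-dir", m.HostPath,    // host directory exported under the mount tag
        "--cache", "auto",
    )
    // Start, not Run: the daemon keeps serving until StopAll/StopMount kills it.
    return cmd.Start()
}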
// Stop gracefully shuts down the VM.
// HotMountWorkspace mounts a new workspace into a running VM without restart.
// This uses QMP to dynamically add a vhost-user-fs-pci device.
func HotMountWorkspace(ws *workspace.Workspace) mo.Result[struct{}] {
if !IsRunning() {
return mo.Err[struct{}](fmt.Errorf("VM is not running"))
}
// Acquire lock to prevent concurrent hot-mounts
lockResult := lock.Acquire(10 * time.Second)
if lockResult.IsError() {
return mo.Err[struct{}](lockResult.Error())
}
vmLock := lockResult.MustGet()
defer vmLock.Release()
// Start virtiofsd for this workspace
mount := virtiofsd.WorkspaceMount(ws.MountTag, ws.HostPath)
vfsManager := virtiofsd.NewManager(config.StateDir)
startResult := vfsManager.StartMount(mount)
if startResult.IsError() {
return mo.Err[struct{}](fmt.Errorf("failed to start virtiofsd: %w", startResult.Error()))
}
// Connect to QMP and hot-add the device
qmpResult := qmp.Connect(config.QMPSocket)
if qmpResult.IsError() {
vfsManager.StopMount(mount)
return mo.Err[struct{}](fmt.Errorf("failed to connect to QMP: %w", qmpResult.Error()))
}
client := qmpResult.MustGet()
defer client.Close()
// Hot-mount the filesystem
hotMountResult := client.HotMountFilesystem(ws.MountTag, mount.SocketPath)
if hotMountResult.IsError() {
vfsManager.StopMount(mount)
return mo.Err[struct{}](fmt.Errorf("failed to hot-mount filesystem: %w", hotMountResult.Error()))
}
logging.Info(fmt.Sprintf("Hot-mounted workspace %s", ws.MountTag))
return mo.Ok(struct{}{})
}
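Hot-plugging the device only makes the tag visible to the guest; nothing mounts it automatically, so a follow-up step along these lines is assumed (the sshpass/root/localhost invocation mirrors waitForSSH below, while the /workspaces mount point and port plumbing are invented):

func mountInGuestSketch(sshPort int, tag string) error {
    guestCmd := fmt.Sprintf("mkdir -p /workspaces/%s && mount -t virtiofs %s /workspaces/%s", tag, tag, tag)
    ssh := exec.Command("sshpass", "-p", "root",
        "ssh", "-p", strconv.Itoa(sshPort),
        "-o", "StrictHostKeyChecking=no",
        "root@localhost", guestCmd)
    if out, err := ssh.CombinedOutput(); err != nil {
        return fmt.Errorf("guest mount of %s failed: %w: %s", tag, err, out)
    }
    return nil
}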
// IsWorkspaceMounted checks if a workspace is already mounted in the running VM.
func IsWorkspaceMounted(ws *workspace.Workspace) bool {
if !IsRunning() {
return false
}
qmpResult := qmp.Connect(config.QMPSocket)
if qmpResult.IsError() {
return false
}
client := qmpResult.MustGet()
defer client.Close()
chardevsResult := client.QueryChardevs()
if chardevsResult.IsError() {
return false
}
for _, label := range chardevsResult.MustGet() {
if label == ws.MountTag {
return true
}
}
return false
}
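Taken together, a qvm run-style caller is presumably expected to do something like the following before attaching to a running VM (sketch only; the package name vm for this file is an assumption):

func ensureWorkspaceMounted(ws *workspace.Workspace) error {
    if vm.IsWorkspaceMounted(ws) {
        return nil // a chardev with this tag already exists in QEMU
    }
    if res := vm.HotMountWorkspace(ws); res.IsError() {
        return res.Error()
    }
    return nil
}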
// Stop gracefully shuts down the VM and virtiofsd daemons.
// Sequence:
// 1. Read PID from file
// 2. Send SIGTERM to the process
// 3. Wait up to 30 seconds for graceful shutdown (poll every second)
// 4. If still running, send SIGKILL
// 5. Clean up PID and port files
// 1. Acquire lock
// 2. Read PID from file
// 3. Send SIGTERM to the process
// 4. Wait up to 30 seconds for graceful shutdown
// 5. If still running, send SIGKILL
// 6. Stop all virtiofsd daemons
// 7. Clean up state files
//
// Returns success even if VM is not running (idempotent).
func Stop() mo.Result[struct{}] {
// 1. Read PID file
// Acquire lock
lockResult := lock.Acquire(30 * time.Second)
if lockResult.IsError() {
return mo.Err[struct{}](lockResult.Error())
}
vmLock := lockResult.MustGet()
defer vmLock.Release()
// Stop virtiofsd daemons first
vfsManager := virtiofsd.NewManager(config.StateDir)
vfsManager.StopAll()
// Read PID file
pidBytes, err := os.ReadFile(config.PIDFile)
if err != nil {
if os.IsNotExist(err) {
// Not running
cleanupStateFiles()
return mo.Ok(struct{}{})
}
return mo.Err[struct{}](fmt.Errorf("failed to read PID file: %w", err))
@@ -178,48 +267,55 @@ func Stop() mo.Result[struct{}] {
pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
if err != nil {
cleanupStateFiles()
return mo.Err[struct{}](fmt.Errorf("invalid PID in file: %w", err))
}
// Check if process exists
process, err := os.FindProcess(pid)
if err != nil {
// Process doesn't exist, clean up
cleanupStateFiles()
return mo.Ok(struct{}{})
}
// 2. Send SIGTERM for graceful shutdown
// Send SIGTERM for graceful shutdown
if err := process.Signal(syscall.SIGTERM); err != nil {
// Process already gone
cleanupStateFiles()
return mo.Ok(struct{}{})
}
// 3. Wait up to 30 seconds for process to exit (poll every second)
// Wait up to 30 seconds for process to exit
for i := 0; i < 30; i++ {
time.Sleep(1 * time.Second)
// Check if process still exists by sending signal 0
if err := process.Signal(syscall.Signal(0)); err != nil {
// Process no longer exists
cleanupStateFiles()
return mo.Ok(struct{}{})
}
}
// 4. Timeout, force kill
// Timeout, force kill
_ = process.Signal(syscall.SIGKILL)
// Wait a moment for SIGKILL to take effect
time.Sleep(1 * time.Second)
// 5. Clean up state files
cleanupStateFiles()
return mo.Ok(struct{}{})
}
// stopQEMU stops just the QEMU process (used during failed starts)
func stopQEMU() {
pidBytes, err := os.ReadFile(config.PIDFile)
if err != nil {
return
}
pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
if err != nil {
return
}
if process, err := os.FindProcess(pid); err == nil {
_ = process.Kill()
}
}
// cleanupStateFiles removes all VM state files
func cleanupStateFiles() {
_ = os.Remove(config.PIDFile)
@@ -239,7 +335,6 @@ func Status() mo.Result[VMStatus] {
return mo.Ok(status)
}
// Read PID
pidBytes, err := os.ReadFile(config.PIDFile)
if err != nil {
return mo.Err[VMStatus](fmt.Errorf("failed to read PID file: %w", err))
@@ -250,7 +345,6 @@ func Status() mo.Result[VMStatus] {
return mo.Err[VMStatus](fmt.Errorf("invalid PID in file: %w", err))
}
// Read SSH port
portBytes, err := os.ReadFile(config.SSHPortFile)
if err != nil {
return mo.Err[VMStatus](fmt.Errorf("failed to read SSH port file: %w", err))
@@ -269,15 +363,12 @@
}
// Reset stops the VM and deletes the overlay image.
// This returns the VM to a fresh state based on the base image.
func Reset() mo.Result[struct{}] {
// Stop VM if running
stopResult := Stop()
if stopResult.IsError() {
return mo.Err[struct{}](fmt.Errorf("failed to stop VM: %w", stopResult.Error()))
}
// Delete overlay image
if err := os.Remove(config.Overlay); err != nil && !os.IsNotExist(err) {
return mo.Err[struct{}](fmt.Errorf("failed to delete overlay: %w", err))
}
@@ -285,8 +376,7 @@ func Reset() mo.Result[struct{}] {
return mo.Ok(struct{}{})
}
// IsRunning performs a quick check if the VM is running by checking
// the PID file and verifying the process exists.
// IsRunning performs a quick check if the VM is running.
func IsRunning() bool {
pidBytes, err := os.ReadFile(config.PIDFile)
if err != nil {
@@ -298,7 +388,6 @@ func IsRunning() bool {
return false
}
// Check if process exists by sending signal 0
process, err := os.FindProcess(pid)
if err != nil {
return false
@@ -308,35 +397,6 @@ func IsRunning() bool {
return err == nil
}
func buildQEMUArgs(cfg *config.Config, sshPort int, mounts []Mount) []string {
// Boot directly from the qcow2 disk image (has GRUB installed)
// Do NOT use -kernel/-initrd; that is for the NixOS VM runner, which requires special 9p mounts
args := []string{
"-machine", "q35",
"-accel", "kvm",
"-cpu", "host",
"-m", cfg.VM.Memory,
"-smp", strconv.Itoa(cfg.VM.CPUs),
"-display", "none",
"-daemonize",
"-pidfile", config.PIDFile,
"-drive", fmt.Sprintf("file=%s,if=virtio,format=qcow2", config.Overlay),
"-netdev", fmt.Sprintf("user,id=n0,hostfwd=tcp::%d-:22", sshPort),
"-device", "virtio-net-pci,netdev=n0",
"-serial", fmt.Sprintf("file:%s", config.SerialLog),
}
// Add 9p mounts for cache directories and workspaces
for _, mount := range mounts {
args = append(args,
"-virtfs", fmt.Sprintf("local,path=%s,mount_tag=%s,security_model=mapped-xattr,id=%s",
mount.HostPath, mount.Tag, mount.Tag),
)
}
return args
}
// findAvailablePort finds an available TCP port starting from the given base port.
func findAvailablePort(basePort int) (int, error) {
const maxAttempts = 100
@@ -354,7 +414,6 @@ func findAvailablePort(basePort int) (int, error) {
}
// waitForSSH waits for SSH to become available on the given port.
// Uses sshpass with password 'root' to test connection.
func waitForSSH(port int, timeout time.Duration) error {
deadline := time.Now().Add(timeout)


@@ -7,42 +7,30 @@ import (
"strconv"
)
// buildQEMUCommand builds the QEMU command line for virtiofsd-based mounts.
// Uses vhost-user-fs-pci devices which support hot-plugging.
func buildQEMUCommand(cfg *config.Config, sshPort int, mounts []virtiofsd.Mount) []string {
args := []string{
"qemu-system-x86_64",
"-enable-kvm",
}
memSize := cfg.VM.Memory
args = append(args,
// vhost-user-fs requires a shared-memory backend
args := []string{
"-machine", "q35",
"-accel", "kvm",
"-cpu", "host",
"-object", fmt.Sprintf("memory-backend-memfd,id=mem,size=%s,share=on", memSize),
"-numa", "node,memdev=mem",
)
args = append(args,
"-smp", strconv.Itoa(cfg.VM.CPUs),
)
args = append(args,
"-drive", fmt.Sprintf("if=virtio,file=%s,format=qcow2", config.Overlay),
)
args = append(args,
"-nic", fmt.Sprintf("user,model=virtio-net-pci,hostfwd=tcp::%d-:22", sshPort),
)
args = append(args,
"-serial", fmt.Sprintf("file:%s", config.SerialLog),
)
args = append(args,
"-qmp", fmt.Sprintf("unix:%s,server,nowait", config.QMPSocket),
)
args = append(args,
"-display", "none",
)
"-daemonize",
"-pidfile", config.PIDFile,
"-drive", fmt.Sprintf("file=%s,if=virtio,format=qcow2", config.Overlay),
"-netdev", fmt.Sprintf("user,id=n0,hostfwd=tcp::%d-:22", sshPort),
"-device", "virtio-net-pci,netdev=n0",
"-serial", fmt.Sprintf("file:%s", config.SerialLog),
"-qmp", fmt.Sprintf("unix:%s,server,nowait", config.QMPSocket),
}
// Add vhost-user-fs devices for each mount
for _, mount := range mounts {
args = append(args,
"-chardev", fmt.Sprintf("socket,id=%s,path=%s", mount.Tag, mount.SocketPath),