// qvm/internal/vm/lifecycle.go

package vm

import (
	"fmt"
	"net"
	"os"
	"os/exec"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/samber/mo"

	"qvm/internal/config"
	"qvm/internal/lock"
	"qvm/internal/logging"
	"qvm/internal/qmp"
	"qvm/internal/virtiofsd"
	"qvm/internal/workspace"
)

// VMStatus represents the current state of the VM.
type VMStatus struct {
	Running bool
	PID     int
	SSHPort int
}

// Start launches the VM with all configured mounts using virtiofsd.
// Sequence:
//  1. Acquire exclusive lock (prevents races between concurrent `qvm run`s)
//  2. Check whether the VM is already running (PID file plus process check)
//  3. Clean up any stale state files from previous failed starts
//  4. Ensure all required directories, the base image, and the overlay exist
//  5. Start virtiofsd daemons for all mounts (caches + workspaces)
//  6. Find an available SSH port
//  7. Build and start QEMU with vhost-user-fs-pci devices
//  8. Write the SSH port state file (QEMU writes the PID file as it daemonizes)
//  9. Wait for SSH to become available (120 second timeout)
//
// Returns an error if any step fails.
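//
// A minimal caller sketch (obtaining cfg and reg is outside this file):
//
//	if res := vm.Start(cfg, reg); res.IsError() {
//		fmt.Fprintln(os.Stderr, "qvm:", res.Error())
//		os.Exit(1)
//	}
//
// For each mount, buildQEMUCommand (defined elsewhere in this package) is
// expected to emit vhost-user-fs wiring roughly like the following; the exact
// arguments are illustrative, not a copy of that function's output:
//
//	-chardev socket,id=<tag>,path=<socket path>
//	-device vhost-user-fs-pci,chardev=<tag>,tag=<tag>
//
// plus shareable guest memory (e.g. -object memory-backend-memfd,share=on,...
// with a matching -numa node,memdev=...), which vhost-user devices require.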
func Start(cfg *config.Config, reg *workspace.Registry) mo.Result[struct{}] {
	// 1. Acquire exclusive lock
	lockResult := lock.Acquire(30 * time.Second)
	if lockResult.IsError() {
		return mo.Err[struct{}](lockResult.Error())
	}
	vmLock := lockResult.MustGet()
	defer vmLock.Release()

	// 2. Check if already running
	if IsRunning() {
		return mo.Err[struct{}](fmt.Errorf("VM is already running"))
	}

	// 3. Clean up any stale state files
	cleanupStateFiles()

	// 4. Ensure directories exist
	if err := config.EnsureDirs(); err != nil {
		return mo.Err[struct{}](fmt.Errorf("failed to create directories: %w", err))
	}

	// 4a. Check that the base image exists
	if _, err := os.Stat(config.BaseImage); os.IsNotExist(err) {
		return mo.Err[struct{}](fmt.Errorf("base image not found at %s - run 'qvm rebuild' first", config.BaseImage))
	}

	// 4b. Create the overlay if it doesn't exist. The qcow2 overlay is backed
	// by the base image and captures all guest writes; deleting it (see Reset)
	// reverts the VM to the pristine base image.
	if _, err := os.Stat(config.Overlay); os.IsNotExist(err) {
		logging.Info("Creating overlay image backed by base image...")
		cmd := exec.Command("qemu-img", "create", "-f", "qcow2",
			"-F", "qcow2", "-b", config.BaseImage, config.Overlay)
		if output, err := cmd.CombinedOutput(); err != nil {
			return mo.Err[struct{}](fmt.Errorf("failed to create overlay: %s: %w", string(output), err))
		}
	}

	// 5. Build mount list and start virtiofsd daemons
	mounts := virtiofsd.DefaultCacheMounts()

	// Add the opencode config mount if the directory exists
	if _, err := os.Stat(config.HostOpencodeConfig); err == nil {
		mounts = append(mounts, virtiofsd.Mount{
			Tag:        "opencode_config",
			HostPath:   config.HostOpencodeConfig,
			SocketPath: config.StateDir + "/opencode_config.sock",
		})
	}

	// Add workspace mounts from the registry
	for _, ws := range reg.List() {
		mounts = append(mounts, virtiofsd.WorkspaceMount(ws.MountTag, ws.HostPath))
	}

	// Start the virtiofsd manager. The daemons must be listening on their
	// sockets before QEMU starts, since QEMU connects to them at boot.
	vfsManager := virtiofsd.NewManager(config.StateDir)
	startResult := vfsManager.StartAll(mounts)
	if startResult.IsError() {
		return mo.Err[struct{}](fmt.Errorf("failed to start virtiofsd daemons: %w", startResult.Error()))
	}

	// 6. Find an available SSH port
	sshPort, err := findAvailablePort(2222)
	if err != nil {
		vfsManager.StopAll()
		return mo.Err[struct{}](fmt.Errorf("failed to find available SSH port: %w", err))
	}

	// 7. Build the QEMU command and start the VM. QEMU daemonizes itself, so
	// CombinedOutput returns once the parent process exits; any startup errors
	// it printed are included in the failure message.
	args := buildQEMUCommand(cfg, sshPort, mounts)
	logging.Info(fmt.Sprintf("Starting QEMU with %d mounts...", len(mounts)))
	cmd := exec.Command("qemu-system-x86_64", args...)
	if output, err := cmd.CombinedOutput(); err != nil {
		vfsManager.StopAll()
		return mo.Err[struct{}](fmt.Errorf("failed to start QEMU: %s: %w\n\nTry running 'qvm doctor' to diagnose the issue.", string(output), err))
	}

	// Wait for the daemonized QEMU to write its PID file.
	logging.Info("Waiting for VM to daemonize...")
	pidFileReady := false
	for i := 0; i < 10; i++ {
		time.Sleep(500 * time.Millisecond)
		if _, err := os.Stat(config.PIDFile); err == nil {
			pidFileReady = true
			break
		}
	}
	if !pidFileReady {
		vfsManager.StopAll()
		return mo.Err[struct{}](fmt.Errorf("QEMU did not create PID file after 5 seconds\n\nTry running 'qvm doctor' to diagnose the issue."))
	}
	pidBytes, err := os.ReadFile(config.PIDFile)
	if err != nil {
		vfsManager.StopAll()
		return mo.Err[struct{}](fmt.Errorf("failed to read PID file: %w", err))
	}
	pid := strings.TrimSpace(string(pidBytes))
	logging.Info("VM started with PID " + pid)

	// 8. Write the SSH port state file
	if err := os.WriteFile(config.SSHPortFile, []byte(strconv.Itoa(sshPort)), 0644); err != nil {
		stopQEMU()
		vfsManager.StopAll()
		return mo.Err[struct{}](fmt.Errorf("failed to write SSH port file: %w", err))
	}

	// 9. Wait for SSH
	if err := waitForSSH(sshPort, 120*time.Second); err != nil {
		stopQEMU()
		vfsManager.StopAll()
		cleanupStateFiles()
		return mo.Err[struct{}](fmt.Errorf("VM started but SSH not available: %w\n\nTry running 'qvm doctor' to diagnose the issue.", err))
	}
	return mo.Ok(struct{}{})
}

// HotMountWorkspace mounts a new workspace into a running VM without a
// restart, using QMP to hot-add a vhost-user-fs-pci device backed by a
// freshly started virtiofsd.
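//
// Under the hood this corresponds to a QMP exchange along these lines (the
// exact payloads live in the qmp package; shown here only for orientation):
//
//	{"execute": "chardev-add", "arguments": {"id": "<tag>",
//	  "backend": {"type": "socket", "data": {"addr": {"type": "unix",
//	  "data": {"path": "<virtiofsd socket>"}}}}}}
//	{"execute": "device_add", "arguments": {"driver": "vhost-user-fs-pci",
//	  "chardev": "<tag>", "tag": "<tag>"}}
//
// After the device appears, the guest still has to mount it, e.g.
// `mount -t virtiofs <tag> /some/path`; that step happens outside this
// function.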
func HotMountWorkspace(ws *workspace.Workspace) mo.Result[struct{}] {
	if !IsRunning() {
		return mo.Err[struct{}](fmt.Errorf("VM is not running"))
	}

	// Acquire lock to prevent concurrent hot-mounts
	lockResult := lock.Acquire(10 * time.Second)
	if lockResult.IsError() {
		return mo.Err[struct{}](lockResult.Error())
	}
	vmLock := lockResult.MustGet()
	defer vmLock.Release()

	// Start virtiofsd for this workspace
	mount := virtiofsd.WorkspaceMount(ws.MountTag, ws.HostPath)
	vfsManager := virtiofsd.NewManager(config.StateDir)
	startResult := vfsManager.StartMount(mount)
	if startResult.IsError() {
		return mo.Err[struct{}](fmt.Errorf("failed to start virtiofsd: %w", startResult.Error()))
	}

	// Connect to QMP and hot-add the device
	qmpResult := qmp.Connect(config.QMPSocket)
	if qmpResult.IsError() {
		vfsManager.StopMount(mount)
		return mo.Err[struct{}](fmt.Errorf("failed to connect to QMP: %w", qmpResult.Error()))
	}
	client := qmpResult.MustGet()
	defer client.Close()

	// Hot-mount the filesystem
	hotMountResult := client.HotMountFilesystem(ws.MountTag, mount.SocketPath)
	if hotMountResult.IsError() {
		vfsManager.StopMount(mount)
		return mo.Err[struct{}](fmt.Errorf("failed to hot-mount filesystem: %w", hotMountResult.Error()))
	}
	logging.Info(fmt.Sprintf("Hot-mounted workspace %s", ws.MountTag))
	return mo.Ok(struct{}{})
}

// IsWorkspaceMounted checks if a workspace is already mounted in the running
// VM. A hot-mounted filesystem shows up as a QEMU chardev whose label is the
// mount tag, so the check is a chardev query.
func IsWorkspaceMounted(ws *workspace.Workspace) bool {
	if !IsRunning() {
		return false
	}
	qmpResult := qmp.Connect(config.QMPSocket)
	if qmpResult.IsError() {
		return false
	}
	client := qmpResult.MustGet()
	defer client.Close()

	chardevsResult := client.QueryChardevs()
	if chardevsResult.IsError() {
		return false
	}
	for _, label := range chardevsResult.MustGet() {
		if label == ws.MountTag {
			return true
		}
	}
	return false
}

// Stop gracefully shuts down the VM and virtiofsd daemons.
// Sequence:
//  1. Acquire exclusive lock
//  2. Stop all virtiofsd daemons
//  3. Read the PID from its file
//  4. Send SIGTERM to the QEMU process
//  5. Wait up to 30 seconds for graceful shutdown
//  6. If still running, send SIGKILL
//  7. Clean up state files
//
// Returns success even if the VM is not running (idempotent).
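//
// Liveness is probed with Signal(0) (signal number zero), which performs the
// kernel's existence and permission checks without delivering a signal; an
// error from the probe therefore means the process is gone. The same idiom
// appears in IsRunning below.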
func Stop() mo.Result[struct{}] {
	// Acquire lock
	lockResult := lock.Acquire(30 * time.Second)
	if lockResult.IsError() {
		return mo.Err[struct{}](lockResult.Error())
	}
	vmLock := lockResult.MustGet()
	defer vmLock.Release()

	// Stop virtiofsd daemons first
	vfsManager := virtiofsd.NewManager(config.StateDir)
	vfsManager.StopAll()

	// Read the PID file
	pidBytes, err := os.ReadFile(config.PIDFile)
	if err != nil {
		if os.IsNotExist(err) {
			cleanupStateFiles()
			return mo.Ok(struct{}{})
		}
		return mo.Err[struct{}](fmt.Errorf("failed to read PID file: %w", err))
	}
	pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
	if err != nil {
		cleanupStateFiles()
		return mo.Err[struct{}](fmt.Errorf("invalid PID in file: %w", err))
	}

	// Check if the process exists
	process, err := os.FindProcess(pid)
	if err != nil {
		cleanupStateFiles()
		return mo.Ok(struct{}{})
	}

	// Send SIGTERM for graceful shutdown
	if err := process.Signal(syscall.SIGTERM); err != nil {
		cleanupStateFiles()
		return mo.Ok(struct{}{})
	}

	// Wait up to 30 seconds for the process to exit
	for i := 0; i < 30; i++ {
		time.Sleep(1 * time.Second)
		if err := process.Signal(syscall.Signal(0)); err != nil {
			cleanupStateFiles()
			return mo.Ok(struct{}{})
		}
	}

	// Timed out; force kill
	_ = process.Signal(syscall.SIGKILL)
	time.Sleep(1 * time.Second)
	cleanupStateFiles()
	return mo.Ok(struct{}{})
}

// stopQEMU stops just the QEMU process (used to unwind failed starts).
func stopQEMU() {
	pidBytes, err := os.ReadFile(config.PIDFile)
	if err != nil {
		return
	}
	pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
	if err != nil {
		return
	}
	if process, err := os.FindProcess(pid); err == nil {
		_ = process.Kill()
	}
}

// cleanupStateFiles removes all VM state files.
func cleanupStateFiles() {
	_ = os.Remove(config.PIDFile)
	_ = os.Remove(config.SSHPortFile)
	_ = os.Remove(config.QMPSocket)
}

// Status returns the current VM status (running, PID, SSH port).
func Status() mo.Result[VMStatus] {
	status := VMStatus{
		Running: false,
		PID:     0,
		SSHPort: 0,
	}
	if !IsRunning() {
		return mo.Ok(status)
	}
	pidBytes, err := os.ReadFile(config.PIDFile)
	if err != nil {
		return mo.Err[VMStatus](fmt.Errorf("failed to read PID file: %w", err))
	}
	pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
	if err != nil {
		return mo.Err[VMStatus](fmt.Errorf("invalid PID in file: %w", err))
	}
	portBytes, err := os.ReadFile(config.SSHPortFile)
	if err != nil {
		return mo.Err[VMStatus](fmt.Errorf("failed to read SSH port file: %w", err))
	}
	sshPort, err := strconv.Atoi(strings.TrimSpace(string(portBytes)))
	if err != nil {
		return mo.Err[VMStatus](fmt.Errorf("invalid SSH port in file: %w", err))
	}
	status.Running = true
	status.PID = pid
	status.SSHPort = sshPort
	return mo.Ok(status)
}

// Reset stops the VM and deletes the overlay image, reverting the guest to
// the pristine base image on the next start.
func Reset() mo.Result[struct{}] {
	stopResult := Stop()
	if stopResult.IsError() {
		return mo.Err[struct{}](fmt.Errorf("failed to stop VM: %w", stopResult.Error()))
	}
	if err := os.Remove(config.Overlay); err != nil && !os.IsNotExist(err) {
		return mo.Err[struct{}](fmt.Errorf("failed to delete overlay: %w", err))
	}
	return mo.Ok(struct{}{})
}

// IsRunning performs a quick check of whether the VM is running.
func IsRunning() bool {
	pidBytes, err := os.ReadFile(config.PIDFile)
	if err != nil {
		return false
	}
	pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
	if err != nil {
		return false
	}
	// On Unix, FindProcess always succeeds; the real liveness test is the
	// zero-signal probe below.
	process, err := os.FindProcess(pid)
	if err != nil {
		return false
	}
	err = process.Signal(syscall.Signal(0))
	return err == nil
}

// findAvailablePort finds an available TCP port starting from the given base
// port by briefly binding each candidate. Binding is more reliable than
// probing with an external tool, though the port can still be claimed by
// another process between this check and QEMU binding it.
func findAvailablePort(basePort int) (int, error) {
	const maxAttempts = 100
	for i := 0; i < maxAttempts; i++ {
		port := basePort + i
		ln, err := net.Listen("tcp", "localhost:"+strconv.Itoa(port))
		if err != nil {
			continue // port in use; try the next one
		}
		ln.Close()
		return port, nil
	}
	return 0, fmt.Errorf("could not find available port after %d attempts", maxAttempts)
}

// waitForSSH waits for SSH to become available on the given port, probing
// once per second until the timeout. It assumes the guest image accepts
// password auth as root with password "root" (hence sshpass).
func waitForSSH(port int, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		cmd := exec.Command("sshpass", "-p", "root",
			"ssh",
			"-o", "StrictHostKeyChecking=no",
			"-o", "UserKnownHostsFile=/dev/null",
			"-o", "ConnectTimeout=1",
			"-p", strconv.Itoa(port),
			"root@localhost",
			"exit 0")
		if err := cmd.Run(); err == nil {
			return nil
		}
		time.Sleep(1 * time.Second)
	}
	return fmt.Errorf("SSH did not become available within %v", timeout)
}