Add migrating services guide and idea drafts, update flake.lock
This commit is contained in:
parent
05f31b80c2
commit
78469896c8
6 changed files with 1779 additions and 3 deletions
425
docs/migrating_services.md
Normal file
425
docs/migrating_services.md
Normal file
|
|
@ -0,0 +1,425 @@
|
||||||
|
# Migrating Services Between Hosts
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This document covers procedures for migrating services between NixOS hosts with minimal downtime.
|
||||||
|
|
||||||
|
## General Migration Strategy
|
||||||
|
|
||||||
|
### Pre-Migration Checklist
|
||||||
|
|
||||||
|
- [ ] New host is configured in flake with identical service config
|
||||||
|
- [ ] New host has required secrets (agenix/sops)
|
||||||
|
- [ ] Network connectivity verified (Tailscale IP assigned)
|
||||||
|
- [ ] Disk space sufficient on new host
|
||||||
|
- [ ] Backup of current state completed
|
||||||
|
|
||||||
|
### Migration Types
|
||||||
|
|
||||||
|
| Type | Downtime | Complexity | Use When |
|
||||||
|
|------|----------|------------|----------|
|
||||||
|
| Cold migration | 5-30 min | Low | Simple services, maintenance windows |
|
||||||
|
| Warm migration | 2-5 min | Medium | Most services |
|
||||||
|
| Hot migration | <1 min | High | Databases with replication |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Cold Migration (Simple)
|
||||||
|
|
||||||
|
Best for: Stateless or rarely-accessed services.
|
||||||
|
|
||||||
|
### Steps
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Stop service on old host
|
||||||
|
ssh oldhost 'systemctl stop myservice'
|
||||||
|
|
||||||
|
# 2. Copy state to new host
|
||||||
|
ssh oldhost 'rsync -avz --progress /var/lib/myservice/ newhost:/var/lib/myservice/'  # run on source host: rsync cannot copy remote-to-remote
|
||||||
|
|
||||||
|
# 3. Start on new host
|
||||||
|
ssh newhost 'systemctl start myservice'
|
||||||
|
|
||||||
|
# 4. Update reverse proxy (if applicable)
|
||||||
|
# Edit nginx config: proxyPass = "http://<new-tailscale-ip>"
|
||||||
|
# Rebuild: ssh proxy 'nixos-rebuild switch'
|
||||||
|
|
||||||
|
# 5. Verify service works
|
||||||
|
|
||||||
|
# 6. Clean up old host (after verification period)
|
||||||
|
ssh oldhost 'rm -rf /var/lib/myservice'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Downtime:** Duration of rsync + service start + proxy update.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Warm Migration (Recommended)
|
||||||
|
|
||||||
|
Best for: Most services with moderate state.
|
||||||
|
|
||||||
|
### Strategy
|
||||||
|
|
||||||
|
1. Sync state while service is running (initial sync)
|
||||||
|
2. Stop service briefly for final sync
|
||||||
|
3. Start on new host
|
||||||
|
4. Update routing
|
||||||
|
|
||||||
|
### Steps
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Initial sync (service still running)
|
||||||
|
ssh oldhost 'rsync -avz --progress /var/lib/myservice/ newhost:/var/lib/myservice/'
|
||||||
|
|
||||||
|
# 2. Stop service on old host
|
||||||
|
ssh oldhost 'systemctl stop myservice'
|
||||||
|
|
||||||
|
# 3. Final sync (quick - only changes since initial sync)
|
||||||
|
ssh oldhost 'rsync -avz --progress /var/lib/myservice/ newhost:/var/lib/myservice/'
|
||||||
|
|
||||||
|
# 4. Start on new host
|
||||||
|
ssh newhost 'systemctl start myservice'
|
||||||
|
|
||||||
|
# 5. Update reverse proxy immediately
|
||||||
|
ssh proxy 'nixos-rebuild switch'
|
||||||
|
|
||||||
|
# 6. Verify
|
||||||
|
curl https://myservice.joshuabell.xyz
|
||||||
|
```
|
||||||
|
|
||||||
|
**Downtime:** 2-5 minutes (final rsync + start + proxy switch).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Hot Migration (Database Services)
|
||||||
|
|
||||||
|
Best for: PostgreSQL, critical services requiring near-zero downtime.
|
||||||
|
|
||||||
|
### PostgreSQL Logical Replication
|
||||||
|
|
||||||
|
#### On Source (Old Host)
|
||||||
|
|
||||||
|
```nix
|
||||||
|
services.postgresql = {
|
||||||
|
settings = {
|
||||||
|
wal_level = "logical";
|
||||||
|
max_replication_slots = 4;
|
||||||
|
max_wal_senders = 4;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Add replication user
|
||||||
|
services.postgresql.ensureUsers = [{
|
||||||
|
name = "replicator";
|
||||||
|
ensurePermissions."ALL TABLES IN SCHEMA public" = "SELECT"; # NOTE: ensurePermissions was removed in NixOS 23.11 — grant SELECT to replicator via an SQL init script instead
|
||||||
|
}];
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Set Up Replication
|
||||||
|
|
||||||
|
```sql
|
||||||
|
-- On source: Create publication
|
||||||
|
CREATE PUBLICATION my_pub FOR ALL TABLES;
|
||||||
|
|
||||||
|
-- On target: Create subscription
|
||||||
|
CREATE SUBSCRIPTION my_sub
|
||||||
|
CONNECTION 'host=oldhost dbname=mydb user=replicator'
|
||||||
|
PUBLICATION my_pub;
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Cutover
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Verify replication is caught up
|
||||||
|
# Check lag on target:
|
||||||
|
psql -c 'SELECT * FROM pg_stat_subscription;'
|
||||||
|
|
||||||
|
# 2. Stop writes on source (maintenance mode)
|
||||||
|
|
||||||
|
# 3. Wait for final sync
|
||||||
|
|
||||||
|
# 4. Promote target (drop subscription)
|
||||||
|
psql -c 'DROP SUBSCRIPTION my_sub;'
|
||||||
|
|
||||||
|
# 5. Update application connection strings
|
||||||
|
|
||||||
|
# 6. Update reverse proxy
|
||||||
|
```
|
||||||
|
|
||||||
|
**Downtime:** <1 minute (just the cutover).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Service-Specific Procedures
|
||||||
|
|
||||||
|
### Forgejo (Git Server)
|
||||||
|
|
||||||
|
**State locations:**
|
||||||
|
- `/var/lib/forgejo/data/` - Git repositories, LFS
|
||||||
|
- `/var/lib/forgejo/postgres/` - PostgreSQL database
|
||||||
|
- `/var/lib/forgejo/backups/` - Existing backups
|
||||||
|
|
||||||
|
**Procedure (Warm Migration):**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Put Forgejo in maintenance mode (optional)
|
||||||
|
ssh h001 'touch /var/lib/forgejo/data/maintenance'
|
||||||
|
|
||||||
|
# 2. Backup database inside container
|
||||||
|
ssh h001 'nixos-container run forgejo -- pg_dumpall -U forgejo > /var/lib/forgejo/backups/pre-migration.sql'
|
||||||
|
|
||||||
|
# 3. Initial sync
|
||||||
|
ssh h001 'rsync -avz --progress /var/lib/forgejo/ newhost:/var/lib/forgejo/'  # run on h001: rsync cannot copy remote-to-remote
|
||||||
|
|
||||||
|
# 4. Stop container
|
||||||
|
ssh h001 'systemctl stop container@forgejo'
|
||||||
|
|
||||||
|
# 5. Final sync
|
||||||
|
ssh h001 'rsync -avz --progress /var/lib/forgejo/ newhost:/var/lib/forgejo/'
|
||||||
|
|
||||||
|
# 6. Start on new host
|
||||||
|
ssh newhost 'systemctl start container@forgejo'
|
||||||
|
|
||||||
|
# 7. Update O001 nginx
|
||||||
|
# Change: proxyPass = "http://100.64.0.13" → "http://<new-ip>"
|
||||||
|
ssh o001 'nixos-rebuild switch'
|
||||||
|
|
||||||
|
# 8. Verify
|
||||||
|
git clone https://git.joshuabell.xyz/test/repo.git
|
||||||
|
|
||||||
|
# 9. Remove maintenance mode
|
||||||
|
ssh newhost 'rm /var/lib/forgejo/data/maintenance'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Downtime:** ~5 minutes.
|
||||||
|
|
||||||
|
### Zitadel (SSO)
|
||||||
|
|
||||||
|
**State locations:**
|
||||||
|
- `/var/lib/zitadel/postgres/` - PostgreSQL database
|
||||||
|
- `/var/lib/zitadel/backups/` - Backups
|
||||||
|
|
||||||
|
**Critical notes:**
|
||||||
|
- SSO is used by other services - coordinate downtime
|
||||||
|
- Test authentication after migration
|
||||||
|
- May need to clear client caches
|
||||||
|
|
||||||
|
**Procedure:** Same as Forgejo.
|
||||||
|
|
||||||
|
### Vaultwarden (Password Manager)
|
||||||
|
|
||||||
|
**State locations:**
|
||||||
|
- `/var/lib/vaultwarden/` - SQLite database, attachments
|
||||||
|
|
||||||
|
**Critical notes:**
|
||||||
|
- MOST CRITICAL SERVICE - users depend on this constantly
|
||||||
|
- Prefer hot migration or schedule during low-usage time
|
||||||
|
- Verify emergency access works after migration
|
||||||
|
|
||||||
|
**Procedure:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Enable read-only mode (if supported)
|
||||||
|
|
||||||
|
# 2. Sync while running
|
||||||
|
ssh o001 'rsync -avz --progress /var/lib/vaultwarden/ newhost:/var/lib/vaultwarden/'  # run on o001: rsync cannot copy remote-to-remote
|
||||||
|
|
||||||
|
# 3. Quick cutover
|
||||||
|
ssh o001 'systemctl stop vaultwarden'
|
||||||
|
ssh o001 'rsync -avz --progress /var/lib/vaultwarden/ newhost:/var/lib/vaultwarden/'
|
||||||
|
ssh newhost 'systemctl start vaultwarden'
|
||||||
|
|
||||||
|
# 4. Update DNS/proxy immediately
|
||||||
|
|
||||||
|
# 5. Verify with mobile app and browser extension
|
||||||
|
```
|
||||||
|
|
||||||
|
**Downtime:** 2-3 minutes (coordinate with users).
|
||||||
|
|
||||||
|
### Headscale
|
||||||
|
|
||||||
|
**State locations:**
|
||||||
|
- `/var/lib/headscale/` - SQLite database with node registrations
|
||||||
|
|
||||||
|
**Critical notes:**
|
||||||
|
- ALL mesh connectivity depends on this
|
||||||
|
- Existing connections continue during migration
|
||||||
|
- New connections will fail during downtime
|
||||||
|
|
||||||
|
**Procedure:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Backup current state
|
||||||
|
restic -r /backup/l001 backup /var/lib/headscale --tag pre-migration
|
||||||
|
|
||||||
|
# 2. Sync to new VPS
|
||||||
|
ssh l001 'rsync -avz --progress /var/lib/headscale/ newvps:/var/lib/headscale/'  # run on l001: rsync cannot copy remote-to-remote
|
||||||
|
|
||||||
|
# 3. Stop on old host
|
||||||
|
ssh l001 'systemctl stop headscale'
|
||||||
|
|
||||||
|
# 4. Final sync
|
||||||
|
ssh l001 'rsync -avz --progress /var/lib/headscale/ newvps:/var/lib/headscale/'
|
||||||
|
|
||||||
|
# 5. Start on new host
|
||||||
|
ssh newvps 'systemctl start headscale'
|
||||||
|
|
||||||
|
# 6. Update DNS
|
||||||
|
# headscale.joshuabell.xyz → new IP
|
||||||
|
|
||||||
|
# 7. Verify
|
||||||
|
headscale nodes list
|
||||||
|
tailscale status
|
||||||
|
|
||||||
|
# 8. Test new device joining
|
||||||
|
```
|
||||||
|
|
||||||
|
**Downtime:** 5-10 minutes (include DNS propagation time).
|
||||||
|
|
||||||
|
### AdGuard Home
|
||||||
|
|
||||||
|
**State locations:**
|
||||||
|
- `/var/lib/AdGuardHome/` - Config, query logs, filters
|
||||||
|
|
||||||
|
**Critical notes:**
|
||||||
|
- LAN DNS will fail during migration
|
||||||
|
- Configure backup DNS on clients first
|
||||||
|
|
||||||
|
**Procedure:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Add temporary DNS to DHCP (e.g., 1.1.1.1)
|
||||||
|
# Or have clients use secondary DNS server
|
||||||
|
|
||||||
|
# 2. Quick migration
|
||||||
|
ssh h003 'systemctl stop adguardhome'
|
||||||
|
ssh h003 'rsync -avz --progress /var/lib/AdGuardHome/ newhost:/var/lib/AdGuardHome/'  # run on h003: rsync cannot copy remote-to-remote
|
||||||
|
ssh newhost 'systemctl start adguardhome'
|
||||||
|
|
||||||
|
# 3. Update DHCP to point to new host
|
||||||
|
|
||||||
|
# 4. Verify DNS resolution
|
||||||
|
dig @new-host-ip google.com
|
||||||
|
```
|
||||||
|
|
||||||
|
**Downtime:** 2-3 minutes (clients use backup DNS).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Reverse Proxy Updates
|
||||||
|
|
||||||
|
When migrating services proxied through O001:
|
||||||
|
|
||||||
|
### Current Proxy Mappings (O001 nginx.nix)
|
||||||
|
|
||||||
|
| Domain | Backend |
|
||||||
|
|--------|---------|
|
||||||
|
| chat.joshuabell.xyz | 100.64.0.13 (H001) |
|
||||||
|
| git.joshuabell.xyz | 100.64.0.13 (H001) |
|
||||||
|
| notes.joshuabell.xyz | 100.64.0.13 (H001) |
|
||||||
|
| sec.joshuabell.xyz | 100.64.0.13 (H001) |
|
||||||
|
| sso.joshuabell.xyz | 100.64.0.13 (H001) |
|
||||||
|
| llm.joshuabell.xyz | 100.64.0.13:8095 (H001) |
|
||||||
|
|
||||||
|
### Updating Proxy
|
||||||
|
|
||||||
|
1. Edit `hosts/oracle/o001/nginx.nix`
|
||||||
|
2. Change `proxyPass` to new Tailscale IP
|
||||||
|
3. Commit and push
|
||||||
|
4. `ssh o001 'cd /etc/nixos && git pull && nixos-rebuild switch'`
|
||||||
|
|
||||||
|
Or for faster updates without commit:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Quick test (non-persistent)
|
||||||
|
ssh o001 'sed -i "s/100.64.0.13/100.64.0.XX/g" /etc/nginx/nginx.conf && nginx -s reload'
|
||||||
|
|
||||||
|
# Then update flake and rebuild properly
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Rollback Procedures
|
||||||
|
|
||||||
|
If migration fails:
|
||||||
|
|
||||||
|
### Quick Rollback
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Stop on new host
|
||||||
|
ssh newhost 'systemctl stop myservice'
|
||||||
|
|
||||||
|
# 2. Start on old host (state should still be there)
|
||||||
|
ssh oldhost 'systemctl start myservice'
|
||||||
|
|
||||||
|
# 3. Revert proxy changes
|
||||||
|
ssh proxy 'nixos-rebuild switch --rollback'
|
||||||
|
```
|
||||||
|
|
||||||
|
### If Old State Was Deleted
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Restore from backup
|
||||||
|
restic -r /backup/oldhost restore latest --target / --include /var/lib/myservice
|
||||||
|
|
||||||
|
# Start service
|
||||||
|
systemctl start myservice
|
||||||
|
|
||||||
|
# Revert proxy
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Post-Migration Checklist
|
||||||
|
|
||||||
|
- [ ] Service responds correctly
|
||||||
|
- [ ] Authentication works (if applicable)
|
||||||
|
- [ ] Data integrity verified
|
||||||
|
- [ ] Monitoring updated to new host
|
||||||
|
- [ ] DNS/proxy pointing to new location
|
||||||
|
- [ ] Old host state cleaned up (after grace period)
|
||||||
|
- [ ] Backup job updated for new location
|
||||||
|
- [ ] Documentation updated
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Common Issues
|
||||||
|
|
||||||
|
### "Permission denied" on New Host
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Ensure correct ownership
|
||||||
|
chown -R serviceuser:servicegroup /var/lib/myservice
|
||||||
|
|
||||||
|
# Check SELinux/AppArmor if applicable
|
||||||
|
```
|
||||||
|
|
||||||
|
### Service Can't Connect to Database
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Verify PostgreSQL is running
|
||||||
|
systemctl status postgresql
|
||||||
|
|
||||||
|
# Check connection settings
|
||||||
|
grep -i database /var/lib/myservice/config.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### SSL Certificate Issues
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Certificates are tied to domain, not host
|
||||||
|
# Should work automatically if domain unchanged
|
||||||
|
|
||||||
|
# If issues, force ACME renewal
|
||||||
|
systemctl restart acme-myservice.joshuabell.xyz.service
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tailscale IP Changed
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Get new Tailscale IP
|
||||||
|
tailscale ip -4
|
||||||
|
|
||||||
|
# Update all references to old IP
|
||||||
|
grep -r "100.64.0.XX" /etc/nixos/
|
||||||
|
```
|
||||||
6
hosts/lio/flake.lock
generated
6
hosts/lio/flake.lock
generated
|
|
@ -1303,11 +1303,11 @@
|
||||||
"nixpkgs": "nixpkgs_4"
|
"nixpkgs": "nixpkgs_4"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1769556375,
|
"lastModified": 1769663859,
|
||||||
"narHash": "sha256-Ne2wFTs2fPyyDUIqy/XiYUmnqs6aaNE8/JA6BVBP+Ow=",
|
"narHash": "sha256-5otcBVNOhDtHjHoDzRSf1iN2/wToLGRgNppx44tlRj4=",
|
||||||
"owner": "anomalyco",
|
"owner": "anomalyco",
|
||||||
"repo": "opencode",
|
"repo": "opencode",
|
||||||
"rev": "15ffd3cba1d3bd7d4d84c6911623a9c1d19e6647",
|
"rev": "41ea4694db7636ba184d238fd2a00deb770f9c0b",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
|
||||||
456
ideas/impermanence_everywhere.md
Normal file
456
ideas/impermanence_everywhere.md
Normal file
|
|
@ -0,0 +1,456 @@
|
||||||
|
# Impermanence Rollout Strategy
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This document covers rolling out impermanence (ephemeral root filesystem) to all hosts, using Juni as the template.
|
||||||
|
|
||||||
|
## What is Impermanence?
|
||||||
|
|
||||||
|
**Philosophy:** Root filesystem (`/`) is wiped on every boot (tmpfs or reset subvolume), forcing you to explicitly declare what state to persist.
|
||||||
|
|
||||||
|
**Benefits:**
|
||||||
|
- Clean system by default - no accumulated cruft
|
||||||
|
- Forces documentation of important state
|
||||||
|
- Easy rollback (just reboot)
|
||||||
|
- Security (ephemeral root limits persistence of compromises)
|
||||||
|
- Reproducible server state
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
| Host | Impermanence | Notes |
|
||||||
|
|------|--------------|-------|
|
||||||
|
| Juni | ✅ Implemented | bcachefs with @root/@persist subvolumes |
|
||||||
|
| H001 | ❌ Traditional | Most complex - many services |
|
||||||
|
| H002 | ❌ Traditional | NAS - may not need impermanence |
|
||||||
|
| H003 | ❌ Traditional | Router - good candidate |
|
||||||
|
| O001 | ❌ Traditional | Gateway - good candidate |
|
||||||
|
| L001 | ❌ Traditional | Headscale - good candidate |
|
||||||
|
|
||||||
|
## Juni's Implementation (Reference)
|
||||||
|
|
||||||
|
### Filesystem Layout
|
||||||
|
|
||||||
|
```
|
||||||
|
bcachefs (5 devices, 2x replication)
|
||||||
|
├── @root # Ephemeral - reset each boot
|
||||||
|
├── @nix # Persistent - Nix store
|
||||||
|
├── @persist # Persistent - bind mounts for state
|
||||||
|
└── @snapshots # Automatic snapshots
|
||||||
|
```
|
||||||
|
|
||||||
|
### Boot Process
|
||||||
|
|
||||||
|
1. Create snapshot of @root before reset
|
||||||
|
2. Reset @root subvolume (or recreate)
|
||||||
|
3. Boot into clean system
|
||||||
|
4. Bind mount persisted paths from @persist
|
||||||
|
|
||||||
|
### Persisted Paths (Juni)
|
||||||
|
|
||||||
|
```nix
|
||||||
|
environment.persistence."/persist" = {
|
||||||
|
hideMounts = true;
|
||||||
|
|
||||||
|
directories = [
|
||||||
|
"/var/log"
|
||||||
|
"/var/lib/nixos"
|
||||||
|
"/var/lib/systemd"
|
||||||
|
"/var/lib/tailscale"
|
||||||
|
"/var/lib/flatpak"
|
||||||
|
"/etc/NetworkManager/system-connections"
|
||||||
|
];
|
||||||
|
|
||||||
|
files = [
|
||||||
|
"/etc/machine-id"
|
||||||
|
"/etc/ssh/ssh_host_ed25519_key"
|
||||||
|
"/etc/ssh/ssh_host_ed25519_key.pub"
|
||||||
|
"/etc/ssh/ssh_host_rsa_key"
|
||||||
|
"/etc/ssh/ssh_host_rsa_key.pub"
|
||||||
|
];
|
||||||
|
|
||||||
|
users.josh = {
|
||||||
|
directories = [
|
||||||
|
".ssh"
|
||||||
|
".gnupg"
|
||||||
|
"projects"
|
||||||
|
".config"
|
||||||
|
".local/share"
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Custom Tooling
|
||||||
|
|
||||||
|
Juni has `bcache-impermanence` with commands:
|
||||||
|
- `ls` - List snapshots
|
||||||
|
- `gc` - Garbage collect old snapshots
|
||||||
|
- `diff` - Show changes since last boot (auto-excludes persisted paths)
|
||||||
|
|
||||||
|
Retention policy: 5 recent + 1/week for 4 weeks + 1/month
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Common Pain Point: Finding What Needs Persistence
|
||||||
|
|
||||||
|
> "I often have issues adding new persistent layers and knowing what I need to add"
|
||||||
|
|
||||||
|
### Discovery Workflow
|
||||||
|
|
||||||
|
#### Method 1: Use the Diff Tool
|
||||||
|
|
||||||
|
Before rebooting after installing new software:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On Juni
|
||||||
|
bcache-impermanence diff
|
||||||
|
```
|
||||||
|
|
||||||
|
This shows files created/modified outside persisted paths.
|
||||||
|
|
||||||
|
#### Method 2: Boot and Observe Failures
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# After reboot, check for failures
|
||||||
|
journalctl -b | grep -i "no such file"
|
||||||
|
journalctl -b | grep -i "failed to"
|
||||||
|
journalctl -b | grep -i "permission denied"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Method 3: Monitor File Changes
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Before making changes
|
||||||
|
find /var /etc -type f -printf '%T@ %p\n' 2>/dev/null | sort -n > /tmp/before.txt
|
||||||
|
|
||||||
|
# After running services
|
||||||
|
find /var /etc -type f -printf '%T@ %p\n' 2>/dev/null | sort -n > /tmp/after.txt
|
||||||
|
|
||||||
|
# Compare
|
||||||
|
diff /tmp/before.txt /tmp/after.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Method 4: Service-Specific Patterns
|
||||||
|
|
||||||
|
Most services follow predictable patterns:
|
||||||
|
|
||||||
|
| Pattern | Example | Usually Needs Persistence |
|
||||||
|
|---------|---------|---------------------------|
|
||||||
|
| `/var/lib/${service}` | `/var/lib/postgresql` | Yes |
|
||||||
|
| `/var/cache/${service}` | `/var/cache/nginx` | Usually no |
|
||||||
|
| `/var/log/${service}` | `/var/log/nginx` | Optional |
|
||||||
|
| `/etc/${service}` | `/etc/nginx` | Only if runtime-generated |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Server Impermanence Template
|
||||||
|
|
||||||
|
### Minimal Server Persistence
|
||||||
|
|
||||||
|
```nix
|
||||||
|
environment.persistence."/persist" = {
|
||||||
|
hideMounts = true;
|
||||||
|
|
||||||
|
directories = [
|
||||||
|
# Core system
|
||||||
|
"/var/lib/nixos" # NixOS state DB
|
||||||
|
"/var/lib/systemd/coredump"
|
||||||
|
"/var/log"
|
||||||
|
|
||||||
|
# Network
|
||||||
|
"/var/lib/tailscale"
|
||||||
|
"/etc/NetworkManager/system-connections"
|
||||||
|
|
||||||
|
# ACME certificates
|
||||||
|
"/var/lib/acme"
|
||||||
|
];
|
||||||
|
|
||||||
|
files = [
|
||||||
|
"/etc/machine-id"
|
||||||
|
"/etc/ssh/ssh_host_ed25519_key"
|
||||||
|
"/etc/ssh/ssh_host_ed25519_key.pub"
|
||||||
|
"/etc/ssh/ssh_host_rsa_key"
|
||||||
|
"/etc/ssh/ssh_host_rsa_key.pub"
|
||||||
|
];
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Per-Host Additions
|
||||||
|
|
||||||
|
#### H001 (Services)
|
||||||
|
|
||||||
|
```nix
|
||||||
|
environment.persistence."/persist".directories = [
|
||||||
|
# Add to minimal template:
|
||||||
|
"/var/lib/forgejo"
|
||||||
|
"/var/lib/zitadel"
|
||||||
|
"/var/lib/openbao"
|
||||||
|
"/bao-keys"
|
||||||
|
"/var/lib/trilium"
|
||||||
|
"/var/lib/opengist"
|
||||||
|
"/var/lib/open-webui"
|
||||||
|
"/var/lib/n8n"
|
||||||
|
"/var/lib/nixarr/state"
|
||||||
|
"/var/lib/containers" # Podman/container state
|
||||||
|
];
|
||||||
|
```
|
||||||
|
|
||||||
|
#### O001 (Gateway)
|
||||||
|
|
||||||
|
```nix
|
||||||
|
environment.persistence."/persist".directories = [
|
||||||
|
# Add to minimal template:
|
||||||
|
"/var/lib/vaultwarden"
|
||||||
|
"/var/lib/postgresql"
|
||||||
|
"/var/lib/fail2ban"
|
||||||
|
];
|
||||||
|
```
|
||||||
|
|
||||||
|
#### L001 (Headscale)
|
||||||
|
|
||||||
|
```nix
|
||||||
|
environment.persistence."/persist".directories = [
|
||||||
|
# Add to minimal template:
|
||||||
|
"/var/lib/headscale"
|
||||||
|
];
|
||||||
|
```
|
||||||
|
|
||||||
|
#### H003 (Router)
|
||||||
|
|
||||||
|
```nix
|
||||||
|
environment.persistence."/persist".directories = [
|
||||||
|
# Add to minimal template:
|
||||||
|
"/var/lib/AdGuardHome"
|
||||||
|
"/var/lib/dnsmasq"
|
||||||
|
];
|
||||||
|
|
||||||
|
environment.persistence."/persist".files = [
|
||||||
|
# Add to minimal template:
|
||||||
|
"/boot/keyfile_nvme0n1p1" # LUKS key - CRITICAL
|
||||||
|
];
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Rollout Strategy
|
||||||
|
|
||||||
|
### Phase 1: Lowest Risk (VPS Hosts)
|
||||||
|
|
||||||
|
Start with L001 and O001:
|
||||||
|
- Easy to rebuild from scratch if something goes wrong
|
||||||
|
- Smaller state footprint
|
||||||
|
- Good practice before tackling complex hosts
|
||||||
|
|
||||||
|
**L001 Steps:**
|
||||||
|
1. Back up `/var/lib/headscale/`
|
||||||
|
2. Add impermanence module
|
||||||
|
3. Test on spare VPS first
|
||||||
|
4. Migrate
|
||||||
|
|
||||||
|
**O001 Steps:**
|
||||||
|
1. Back up Vaultwarden and PostgreSQL
|
||||||
|
2. Add impermanence module
|
||||||
|
3. Test carefully (Vaultwarden is critical!)
|
||||||
|
|
||||||
|
### Phase 2: Router (H003)
|
||||||
|
|
||||||
|
H003 is medium complexity:
|
||||||
|
- Relatively small state
|
||||||
|
- But critical for network (test during maintenance window)
|
||||||
|
- LUKS keyfile needs special handling
|
||||||
|
|
||||||
|
### Phase 3: Complex Host (H001)
|
||||||
|
|
||||||
|
H001 is most complex due to:
|
||||||
|
- Multiple containerized services
|
||||||
|
- Database state in containers
|
||||||
|
- Many stateful applications
|
||||||
|
|
||||||
|
**Approach:**
|
||||||
|
1. Inventory all state paths (see backup docs)
|
||||||
|
2. Test with snapshot before committing
|
||||||
|
3. Gradual rollout with extensive persistence list
|
||||||
|
4. May need to persist more than expected initially
|
||||||
|
|
||||||
|
### Phase 4: NAS (H002) - Maybe Skip
|
||||||
|
|
||||||
|
H002 may not benefit from impermanence:
|
||||||
|
- Primary purpose is persistent data storage
|
||||||
|
- bcachefs replication already provides redundancy
|
||||||
|
- Impermanence adds complexity without clear benefit
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Filesystem Options
|
||||||
|
|
||||||
|
### Option A: bcachefs with Subvolumes (Like Juni)
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Flexible, modern
|
||||||
|
- Built-in snapshots
|
||||||
|
- Replication support
|
||||||
|
|
||||||
|
**Setup:**
|
||||||
|
```nix
|
||||||
|
fileSystems = {
|
||||||
|
"/" = {
|
||||||
|
device = "/dev/disk/by-label/nixos";
|
||||||
|
fsType = "bcachefs";
|
||||||
|
options = [ "subvol=@root" ];
|
||||||
|
};
|
||||||
|
"/nix" = {
|
||||||
|
device = "/dev/disk/by-label/nixos";
|
||||||
|
fsType = "bcachefs";
|
||||||
|
options = [ "subvol=@nix" ];
|
||||||
|
};
|
||||||
|
"/persist" = {
|
||||||
|
device = "/dev/disk/by-label/nixos";
|
||||||
|
fsType = "bcachefs";
|
||||||
|
options = [ "subvol=@persist" ];
|
||||||
|
neededForBoot = true;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option B: BTRFS with Subvolumes
|
||||||
|
|
||||||
|
Similar to bcachefs but more mature:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# Reset @root on boot
|
||||||
|
boot.initrd.postDeviceCommands = lib.mkAfter ''
|
||||||
|
mkdir -p /mnt
|
||||||
|
mount -o subvol=/ /dev/disk/by-label/nixos /mnt
|
||||||
|
btrfs subvolume delete /mnt/@root
|
||||||
|
btrfs subvolume create /mnt/@root
|
||||||
|
umount /mnt
|
||||||
|
'';
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option C: tmpfs Root
|
||||||
|
|
||||||
|
Simplest but uses RAM:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
fileSystems."/" = {
|
||||||
|
device = "none";
|
||||||
|
fsType = "tmpfs";
|
||||||
|
options = [ "defaults" "size=2G" "mode=755" ];
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Best for:** VPS hosts with limited disk but adequate RAM.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Service Fails After Reboot
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check what's missing
|
||||||
|
journalctl -xeu servicename
|
||||||
|
|
||||||
|
# Common fixes:
|
||||||
|
# 1. Add /var/lib/servicename to persistence
|
||||||
|
# 2. Ensure directory permissions are correct
|
||||||
|
# 3. Check if service expects specific files in /etc
|
||||||
|
```
|
||||||
|
|
||||||
|
### "No such file or directory" Errors
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Find what's missing
|
||||||
|
journalctl -b | grep "No such file"
|
||||||
|
|
||||||
|
# Add missing paths to persistence
|
||||||
|
```
|
||||||
|
|
||||||
|
### Slow Boot (Too Many Bind Mounts)
|
||||||
|
|
||||||
|
If you have many persisted paths, consider:
|
||||||
|
1. Consolidating related paths
|
||||||
|
2. Using symlinks instead of bind mounts for some paths
|
||||||
|
3. Persisting parent directories instead of many children
|
||||||
|
|
||||||
|
### Container State Issues
|
||||||
|
|
||||||
|
Containers may have their own state directories:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# For NixOS containers
|
||||||
|
environment.persistence."/persist".directories = [
|
||||||
|
"/var/lib/nixos-containers"
|
||||||
|
];
|
||||||
|
|
||||||
|
# For Podman
|
||||||
|
environment.persistence."/persist".directories = [
|
||||||
|
"/var/lib/containers/storage/volumes"
|
||||||
|
# NOT overlay - that's regenerated
|
||||||
|
];
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tooling Improvements
|
||||||
|
|
||||||
|
### Automated Discovery Script
|
||||||
|
|
||||||
|
Create a helper that runs periodically to detect unpersisted changes:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# /usr/local/bin/impermanence-check
|
||||||
|
|
||||||
|
# Get list of persisted paths
|
||||||
|
PERSISTED=$(nix eval --raw '.#nixosConfigurations.hostname.config.environment.persistence."/persist".directories' 2>/dev/null | tr -d '[]"' | tr ' ' '\n')
|
||||||
|
|
||||||
|
# Find modified files outside persisted paths
|
||||||
|
find / -xdev -type f -mmin -60 2>/dev/null | while read -r file; do
|
||||||
|
is_persisted=false
|
||||||
|
for path in $PERSISTED; do
|
||||||
|
if [[ "$file" == "$path"/* || "$file" == "$path" ]]; then  # match on directory boundary so /var/libX doesn't match /var/lib
|
||||||
|
is_persisted=true
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if ! $is_persisted; then
|
||||||
|
echo "UNPERSISTED: $file"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pre-Reboot Check
|
||||||
|
|
||||||
|
Add to your workflow:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Before rebooting
|
||||||
|
bcache-impermanence diff # or custom script
|
||||||
|
|
||||||
|
# Review changes, add to persistence if needed, then reboot
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Action Items
|
||||||
|
|
||||||
|
### Immediate
|
||||||
|
- [ ] Document all state paths for each host (see backup docs)
|
||||||
|
- [ ] Create shared impermanence module in flake
|
||||||
|
|
||||||
|
### Phase 1 (L001/O001)
|
||||||
|
- [ ] Back up current state
|
||||||
|
- [ ] Add impermanence to L001
|
||||||
|
- [ ] Test thoroughly
|
||||||
|
- [ ] Roll out to O001
|
||||||
|
|
||||||
|
### Phase 2 (H003)
|
||||||
|
- [ ] Plan maintenance window
|
||||||
|
- [ ] Add impermanence to H003
|
||||||
|
- [ ] Verify LUKS key persistence
|
||||||
|
|
||||||
|
### Phase 3 (H001)
|
||||||
|
- [ ] Complete state inventory
|
||||||
|
- [ ] Test with extensive persistence list
|
||||||
|
- [ ] Gradual rollout
|
||||||
208
ideas/openbao_secrets_migration.md
Normal file
208
ideas/openbao_secrets_migration.md
Normal file
|
|
@ -0,0 +1,208 @@
|
||||||
|
# OpenBao Secrets Migration
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This document covers migrating from ragenix (age-encrypted secrets) to OpenBao for centralized secret management, enabling zero-config machine onboarding.
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
1. **Zero-config machine onboarding**: New machine = install NixOS + add Zitadel machine key + done
|
||||||
|
2. **Eliminate re-keying workflow**: No more updating secrets.nix and re-encrypting .age files for each new machine
|
||||||
|
3. **Runtime secret dependencies**: Services wait for secrets via systemd, not build-time conditionals
|
||||||
|
4. **Consolidated SSH keys**: Use single `nix2nix` key for all NixOS machine SSH (keep `nix2t` for work)
|
||||||
|
5. **Declarative policy management**: OpenBao policies auto-applied after unseal with reconciliation
|
||||||
|
6. **Directional Tailscale ACLs**: Restrict work machine from reaching NixOS hosts (one-way access)
|
||||||
|
7. **Per-host variable registry**: `_variables.nix` pattern for ports/UIDs/GIDs to prevent conflicts
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
### Ragenix Secrets in Use (21 active)
|
||||||
|
|
||||||
|
**SSH Keys (for client auth):**
|
||||||
|
- nix2github, nix2bitbucket, nix2gitforgejo
|
||||||
|
- nix2nix (shared), nix2t (work - keep separate)
|
||||||
|
- nix2lio (remote builds), nix2oren, nix2gpdPocket3
|
||||||
|
- nix2h001, nix2h003, nix2linode, nix2oracle
|
||||||
|
|
||||||
|
**API Tokens:**
|
||||||
|
- github_read_token (Nix private repo access)
|
||||||
|
- linode_rw_domains (ACME DNS challenge)
|
||||||
|
- litellm_public_api_key (nginx auth)
|
||||||
|
|
||||||
|
**VPN:**
|
||||||
|
- headscale_auth (Tailscale auth)
|
||||||
|
- us_chi_wg (NixArr WireGuard)
|
||||||
|
|
||||||
|
**Application Secrets:**
|
||||||
|
- oauth2_proxy_key_file
|
||||||
|
- openwebui_env
|
||||||
|
- zitadel_master_key
|
||||||
|
- vaultwarden_env
|
||||||
|
|
||||||
|
**Skipping (unused):**
|
||||||
|
- nix2h002, nix2joe, nix2l002, nix2gitjosh, obsidian_sync_env
|
||||||
|
|
||||||
|
### Already Migrated to OpenBao (juni)
|
||||||
|
- headscale_auth, atuin-key-josh, 12 SSH keys
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────────┐
|
||||||
|
│ New Machine Onboarding │
|
||||||
|
├─────────────────────────────────────────────────────────────────┤
|
||||||
|
│ │
|
||||||
|
│ 1. Install NixOS with full config │
|
||||||
|
│ - All services defined but waiting on secrets │
|
||||||
|
│ │
|
||||||
|
│ 2. Create Zitadel machine user + copy key │
|
||||||
|
│ - /machine-key.json → JWT auth to OpenBao │
|
||||||
|
│ │
|
||||||
|
│ 3. vault-agent fetches secrets │
|
||||||
|
│ - kv/data/machines/home_roaming/* → /var/lib/openbao-secrets│
|
||||||
|
│ │
|
||||||
|
│ 4. systemd dependencies resolve │
|
||||||
|
│ - secret-watcher completes → hardDepend services start │
|
||||||
|
│ │
|
||||||
|
│ 5. Machine fully operational │
|
||||||
|
│ │
|
||||||
|
└─────────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Key Design Decisions
|
||||||
|
|
||||||
|
### Secret Path Convention
|
||||||
|
|
||||||
|
```
|
||||||
|
kv/data/machines/
|
||||||
|
├── home_roaming/ # Shared across all NixOS machines
|
||||||
|
│ ├── nix2nix # SSH key
|
||||||
|
│ ├── nix2github # SSH key
|
||||||
|
│ ├── headscale_auth # Tailscale auth
|
||||||
|
│ └── ...
|
||||||
|
├── home/ # h001-specific (not roaming)
|
||||||
|
│ ├── linode_rw_domains
|
||||||
|
│ ├── zitadel_master_key
|
||||||
|
│ └── ...
|
||||||
|
└── oracle/ # o001-specific
|
||||||
|
├── vaultwarden_env
|
||||||
|
└── ...
|
||||||
|
```
|
||||||
|
|
||||||
|
### Runtime Dependencies vs Build-Time Conditionals
|
||||||
|
|
||||||
|
**Before (ragenix pattern - bad for onboarding):**
|
||||||
|
```nix
|
||||||
|
let hasSecret = name: (config.age.secrets or {}) ? ${name};
|
||||||
|
in {
|
||||||
|
config = lib.mkIf (hasSecret "openwebui_env") {
|
||||||
|
services.open-webui.enable = true;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**After (OpenBao pattern - zero-config onboarding):**
|
||||||
|
```nix
|
||||||
|
ringofstorms.secretsBao.secrets.openwebui_env = {
|
||||||
|
kvPath = "kv/data/machines/home_roaming/openwebui_env";
|
||||||
|
hardDepend = [ "open-webui" ]; # Service waits for secret at runtime
|
||||||
|
configChanges.services.open-webui = {
|
||||||
|
enable = true;
|
||||||
|
environmentFile = "$SECRET_PATH";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Per-Host File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
hosts/h001/
|
||||||
|
├── _variables.nix # Ports, UIDs, GIDs - single source of truth
|
||||||
|
├── secrets.nix # All secrets + their configChanges
|
||||||
|
├── flake.nix # Imports, basic host config
|
||||||
|
├── nginx.nix # Pure config (no conditionals)
|
||||||
|
└── mods/
|
||||||
|
├── openbao-policies.nix # Auto-apply after unseal
|
||||||
|
└── ...
|
||||||
|
```
|
||||||
|
|
||||||
|
### OpenBao Policy Management
|
||||||
|
|
||||||
|
Policies auto-apply after unseal with full reconciliation:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# openbao-policies.nix
|
||||||
|
let
|
||||||
|
policies = {
|
||||||
|
machines = ''
|
||||||
|
path "kv/data/machines/home_roaming/*" {
|
||||||
|
capabilities = ["read", "list"]
|
||||||
|
}
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
reservedPolicies = [ "default" "root" ];
|
||||||
|
in {
|
||||||
|
systemd.services.openbao-apply-policies = {
|
||||||
|
after = [ "openbao-auto-unseal.service" ];
|
||||||
|
requires = [ "openbao-auto-unseal.service" ];
|
||||||
|
wantedBy = [ "multi-user.target" ];
|
||||||
|
# Script: apply all policies, delete orphans not in config
|
||||||
|
};
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Headscale ACL Policy
|
||||||
|
|
||||||
|
Directional access control:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# nix machines: full mesh access
|
||||||
|
{ action = "accept"; src = ["group:nix-machines"]; dst = ["group:nix-machines:*"]; }
|
||||||
|
|
||||||
|
# nix machines → work: full access
|
||||||
|
{ action = "accept"; src = ["group:nix-machines"]; dst = ["tag:work:*"]; }
|
||||||
|
|
||||||
|
# work → nix machines: LIMITED (only specific ports)
|
||||||
|
{ action = "accept"; src = ["tag:work"]; dst = ["h001:22,443"]; }
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Phases
|
||||||
|
|
||||||
|
### Phase 1: SSH Key Preparation
|
||||||
|
- [ ] Add nix2nix SSH key to all hosts authorized_keys (alongside existing)
|
||||||
|
- [ ] Deploy with `nh os switch` to all hosts
|
||||||
|
|
||||||
|
### Phase 2: Infrastructure
|
||||||
|
- [ ] Create `_variables.nix` pattern for h001 (pilot)
|
||||||
|
- [ ] Create `openbao-policies.nix` with auto-apply + reconciliation
|
||||||
|
- [ ] Create `headscale-policy.nix` with directional ACLs
|
||||||
|
- [ ] Create per-host `secrets.nix` pattern
|
||||||
|
|
||||||
|
### Phase 3: Secret Migration
|
||||||
|
- [ ] Migrate h001 secrets (linode_rw_domains, us_chi_wg, oauth2_proxy_key_file, openwebui_env, zitadel_master_key)
|
||||||
|
- [ ] Migrate o001 secrets (vaultwarden_env, litellm_public_api_key)
|
||||||
|
- [ ] Migrate common modules (tailnet, ssh, nix_options)
|
||||||
|
- [ ] Migrate SSH client keys
|
||||||
|
|
||||||
|
### Phase 4: Consumer Updates
|
||||||
|
- [ ] Update ssh.nix to use OpenBao paths
|
||||||
|
- [ ] Remove hasSecret conditionals from all modules
|
||||||
|
- [ ] Remove ragenix imports and secrets flake
|
||||||
|
|
||||||
|
### Phase 5: Testing & Finalization
|
||||||
|
- [ ] Populate all secrets in OpenBao KV store
|
||||||
|
- [ ] Test onboarding workflow on fresh VM
|
||||||
|
- [ ] Document new machine onboarding process
|
||||||
|
|
||||||
|
## Related Ideas
|
||||||
|
|
||||||
|
- `impermanence_everywhere.md` - Impermanence persists `/var/lib/openbao-secrets` and `/machine-key.json`
|
||||||
|
- `resilience.md` - OpenBao server (h001) is a SPOF; consider backup/failover
|
||||||
|
- `service_backups.md` - `/var/lib/openbao` and `/bao-keys` need backup
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- OpenBao hosted on h001 at sec.joshuabell.xyz
|
||||||
|
- JWT auth via Zitadel machine users
|
||||||
|
- vault-agent on each host fetches secrets
|
||||||
|
- `sec` CLI tool available for manual lookups
|
||||||
347
ideas/resilience.md
Normal file
347
ideas/resilience.md
Normal file
|
|
@ -0,0 +1,347 @@
|
||||||
|
# Infrastructure Resilience & Failover
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This document covers strategies for eliminating single points of failure and improving infrastructure resilience.
|
||||||
|
|
||||||
|
## Current Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
INTERNET
|
||||||
|
│
|
||||||
|
┌─────────┴─────────┐
|
||||||
|
│ │
|
||||||
|
┌─────▼─────┐ ┌──────▼──────┐
|
||||||
|
│ O001 │ │ L001 │
|
||||||
|
│ (Oracle) │ │ (Linode) │
|
||||||
|
│ nginx │ │ Headscale │
|
||||||
|
│ +vault │ │ (SPOF!) │
|
||||||
|
│ +atuin │ └──────┬──────┘
|
||||||
|
│ (SPOF!) │ │
|
||||||
|
└─────┬─────┘ │
|
||||||
|
│ Tailscale Mesh
|
||||||
|
│ ┌───────────┴───────────┐
|
||||||
|
│ │ │
|
||||||
|
┌─────▼───────▼─────┐ ┌──────▼──────┐
|
||||||
|
│ H001 │ │ H003 │
|
||||||
|
│ (Service Host) │ │ (Router) │
|
||||||
|
│ Forgejo,Zitadel, │ │ AdGuard, │
|
||||||
|
│ LiteLLM,Trilium, │ │ DHCP,NAT │
|
||||||
|
│ NixArr,OpenWebUI │ │ (SPOF!) │
|
||||||
|
└─────────┬─────────┘ └─────────────┘
|
||||||
|
│ NFS
|
||||||
|
┌─────────▼─────────┐
|
||||||
|
│ H002 │
|
||||||
|
│ (NAS - bcachefs)│
|
||||||
|
│ Media, Data │
|
||||||
|
└───────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Critical Single Points of Failure
|
||||||
|
|
||||||
|
| Host | Service | Impact if Down | Recovery Time |
|
||||||
|
|------|---------|----------------|---------------|
|
||||||
|
| **L001** | Headscale | ALL mesh connectivity | HIGH - must restore SQLite exactly |
|
||||||
|
| **O001** | nginx/Vaultwarden | All public access, password manager | MEDIUM |
|
||||||
|
| **H003** | DNS/DHCP/NAT | Entire LAN offline | MEDIUM |
|
||||||
|
| **H001** | All services | Services down but recoverable | MEDIUM |
|
||||||
|
| **H002** | NFS | Media unavailable | LOW - bcachefs has replication |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Reverse Proxy Resilience (O001)
|
||||||
|
|
||||||
|
### Current Problem
|
||||||
|
|
||||||
|
O001 is a single point of failure for all public traffic:
|
||||||
|
- No public access to any service if it dies
|
||||||
|
- DNS still points to it after failure
|
||||||
|
- ACME certs are only on that host
|
||||||
|
|
||||||
|
### Solution Options
|
||||||
|
|
||||||
|
#### Option A: Cloudflare Tunnel (Recommended Quick Win)
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- No single server dependency
|
||||||
|
- Run `cloudflared` on multiple hosts (H001 as backup)
|
||||||
|
- Automatic failover between tunnel replicas
|
||||||
|
- Built-in DDoS protection
|
||||||
|
- No inbound ports needed
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Cannot stream media (Jellyfin) - violates Cloudflare ToS
|
||||||
|
- Adds latency
|
||||||
|
- Vendor dependency
|
||||||
|
|
||||||
|
**Implementation:**
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# On BOTH O001 (primary) AND H001 (backup)
|
||||||
|
services.cloudflared = {
|
||||||
|
enable = true;
|
||||||
|
tunnels."joshuabell" = {
|
||||||
|
credentialsFile = config.age.secrets.cloudflared.path;
|
||||||
|
ingress = {
|
||||||
|
"chat.joshuabell.xyz" = "http://100.64.0.13:80";
|
||||||
|
"git.joshuabell.xyz" = "http://100.64.0.13:80";
|
||||||
|
"notes.joshuabell.xyz" = "http://100.64.0.13:80";
|
||||||
|
"sec.joshuabell.xyz" = "http://100.64.0.13:80";
|
||||||
|
"sso.joshuabell.xyz" = "http://100.64.0.13:80";
|
||||||
|
"n8n.joshuabell.xyz" = "http://100.64.0.13:80";
|
||||||
|
"blog.joshuabell.xyz" = "http://100.64.0.13:80";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
Cloudflare automatically load balances across all active tunnel replicas.
|
||||||
|
|
||||||
|
#### Option B: DNS Failover with Health Checks
|
||||||
|
|
||||||
|
Use Cloudflare DNS with health checks:
|
||||||
|
- Point `joshuabell.xyz` to both O001 and a backup
|
||||||
|
- Cloudflare removes unhealthy IPs automatically
|
||||||
|
- Requires Cloudflare paid plan for load balancing
|
||||||
|
|
||||||
|
#### Option C: Tailscale Funnel
|
||||||
|
|
||||||
|
Expose services directly without O001:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On H001
|
||||||
|
tailscale funnel 443
|
||||||
|
```
|
||||||
|
|
||||||
|
Exposes H001 directly at its tailnet hostname, e.g. `https://h001.<tailnet>.ts.net` (Funnel only serves `ts.net` hostnames, not custom domains)
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- No proxy needed
|
||||||
|
- Per-service granularity
|
||||||
|
- Automatic HTTPS
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Uses `ts.net` domain (no custom domain)
|
||||||
|
- Limited to ports 443, 8443, 10000
|
||||||
|
|
||||||
|
#### Option D: Manual Failover with Shared Config
|
||||||
|
|
||||||
|
Keep H001 ready to take over O001's role:
|
||||||
|
1. Same nginx config via shared NixOS module
|
||||||
|
2. Use DNS-01 ACME challenge (certs work on any host)
|
||||||
|
3. Update DNS when O001 fails
|
||||||
|
|
||||||
|
### Recommended Hybrid Approach
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ RECOMMENDED TOPOLOGY │
|
||||||
|
├─────────────────────────────────────────────────────────────┤
|
||||||
|
│ │
|
||||||
|
│ Cloudflare DNS (health-checked failover) │
|
||||||
|
│ │ │
|
||||||
|
│ ┌──────┴──────┐ │
|
||||||
|
│ │ │ │
|
||||||
|
│ ▼ ▼ │
|
||||||
|
│ O001 ──OR── H001 (via Cloudflare Tunnel) │
|
||||||
|
│ nginx cloudflared backup │
|
||||||
|
│ │
|
||||||
|
│ Jellyfin: Direct via Tailscale Funnel (bypasses O001) │
|
||||||
|
│ Vaultwarden: Cloudflare Tunnel (survives O001 failure) │
|
||||||
|
│ │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key Changes:**
|
||||||
|
1. Move Vaultwarden to Cloudflare Tunnel (survives O001 outage)
|
||||||
|
2. Jellyfin via Tailscale Funnel (no Cloudflare ToS issues)
|
||||||
|
3. Other services via Cloudflare Tunnel with H001 as backup
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Headscale HA (L001)
|
||||||
|
|
||||||
|
### The Problem
|
||||||
|
|
||||||
|
L001 running Headscale is the MOST CRITICAL SPOF:
|
||||||
|
- If Headscale dies, existing connections keep working temporarily
|
||||||
|
- NO NEW devices can connect
|
||||||
|
- Devices that reboot cannot rejoin the mesh
|
||||||
|
- Eventually all mesh connectivity degrades
|
||||||
|
|
||||||
|
### Solution Options
|
||||||
|
|
||||||
|
#### Option 1: Frequent Backups (Minimum Viable)
|
||||||
|
|
||||||
|
```nix
|
||||||
|
my.backup = {
|
||||||
|
enable = true;
|
||||||
|
paths = [ "/var/lib/headscale" "/var/lib/acme" ];
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Recovery time:** ~30 minutes to spin up new VPS + restore
|
||||||
|
|
||||||
|
#### Option 2: Warm Standby
|
||||||
|
|
||||||
|
- Run second Linode/VPS with Headscale configured but stopped
|
||||||
|
- Daily rsync of `/var/lib/headscale/` to standby
|
||||||
|
- Update DNS to point to standby if primary fails
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Daily sync to standby
|
||||||
|
rsync -avz l001:/var/lib/headscale/ standby:/var/lib/headscale/
|
||||||
|
```
|
||||||
|
|
||||||
|
**Recovery time:** ~5 minutes (start service, update DNS)
|
||||||
|
|
||||||
|
#### Option 3: Headscale HA with LiteFS
|
||||||
|
|
||||||
|
Headscale doesn't natively support HA, but you can use:
|
||||||
|
- **LiteFS** for SQLite replication
|
||||||
|
- **Consul** for leader election and failover
|
||||||
|
|
||||||
|
See: https://gawsoft.com/blog/headscale-litefs-consul-replication-failover/
|
||||||
|
|
||||||
|
**Recovery time:** ~15 seconds automatic failover
|
||||||
|
|
||||||
|
#### Option 4: Use Tailscale Commercial
|
||||||
|
|
||||||
|
Let Tailscale handle the control plane HA:
|
||||||
|
- They manage availability
|
||||||
|
- Keep Headscale for learning/experimentation
|
||||||
|
- Critical services use Tailscale commercial
|
||||||
|
|
||||||
|
### Recommendation
|
||||||
|
|
||||||
|
Start with Option 1 (backups) immediately, work toward Option 2 (warm standby) within a month.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Router HA (H003)
|
||||||
|
|
||||||
|
### The Problem
|
||||||
|
|
||||||
|
H003 is the network gateway:
|
||||||
|
- AdGuard Home (DNS filtering)
|
||||||
|
- dnsmasq (DHCP)
|
||||||
|
- NAT firewall
|
||||||
|
- If it dies, entire LAN loses connectivity
|
||||||
|
|
||||||
|
### Solution Options
|
||||||
|
|
||||||
|
#### Option 1: Secondary DNS/DHCP
|
||||||
|
|
||||||
|
Run backup DNS on another host (H001 or H002):
|
||||||
|
- Secondary AdGuard Home instance
|
||||||
|
- Clients configured with both DNS servers
|
||||||
|
- DHCP failover is trickier (consider Kea — ISC DHCP's successor, now EOL — which supports HA failover natively)
|
||||||
|
|
||||||
|
#### Option 2: Keepalived for Router Failover
|
||||||
|
|
||||||
|
If you have two devices that could be routers:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
services.keepalived = {
|
||||||
|
enable = true;
|
||||||
|
vrrpInstances.router = {
|
||||||
|
state = "MASTER"; # or "BACKUP" on secondary
|
||||||
|
interface = "eth0";
|
||||||
|
virtualRouterId = 1;
|
||||||
|
priority = 255; # Lower on backup
|
||||||
|
virtualIps = [{ addr = "10.12.14.1/24"; }];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Option 3: Router Redundancy via ISP
|
||||||
|
|
||||||
|
- Use ISP router as fallback gateway
|
||||||
|
- Clients get two gateways via DHCP
|
||||||
|
- Less control but automatic failover
|
||||||
|
|
||||||
|
### Recommendation
|
||||||
|
|
||||||
|
Run secondary AdGuard Home on H001/H002 as minimum redundancy. Full router HA is complex for homelab.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## NFS HA (H002)
|
||||||
|
|
||||||
|
### Current State
|
||||||
|
|
||||||
|
H002 uses bcachefs with 2x replication across 5 disks. Single host failure still causes data unavailability.
|
||||||
|
|
||||||
|
### Options
|
||||||
|
|
||||||
|
#### Option 1: NFS Client Resilience
|
||||||
|
|
||||||
|
Configure NFS clients to handle server unavailability gracefully:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
fileSystems."/nfs/h002" = {
|
||||||
|
device = "100.64.0.3:/data";
|
||||||
|
fsType = "nfs4";
|
||||||
|
options = [
|
||||||
|
"soft" # Don't hang forever
|
||||||
|
"timeo=50" # 5 second timeout
|
||||||
|
"retrans=3" # 3 retries
|
||||||
|
"nofail" # Don't fail boot if unavailable
|
||||||
|
];
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Option 2: Second NAS with GlusterFS
|
||||||
|
|
||||||
|
For true HA, run two NAS nodes with GlusterFS replication:
|
||||||
|
|
||||||
|
```
|
||||||
|
H002 (bcachefs) ◄──── GlusterFS ────► H00X (bcachefs)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Overkill for homelab**, but an option for critical data.
|
||||||
|
|
||||||
|
### Recommendation
|
||||||
|
|
||||||
|
Current bcachefs replication is adequate. Focus on offsite backups for truly irreplaceable data.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recommended Implementation Order
|
||||||
|
|
||||||
|
### Phase 1: Quick Wins (This Week)
|
||||||
|
1. [ ] Set up Cloudflare Tunnel on O001 AND H001
|
||||||
|
2. [ ] Enable Tailscale Funnel for Jellyfin
|
||||||
|
3. [ ] Automated backups for L001 Headscale
|
||||||
|
|
||||||
|
### Phase 2: Core Resilience (This Month)
|
||||||
|
4. [ ] DNS-01 ACME for shared certs
|
||||||
|
5. [ ] Warm standby for Headscale
|
||||||
|
6. [ ] Secondary AdGuard Home
|
||||||
|
|
||||||
|
### Phase 3: Full Resilience (Next Quarter)
|
||||||
|
7. [ ] Headscale HA with LiteFS (if needed)
|
||||||
|
8. [ ] Automated failover testing
|
||||||
|
9. [ ] Runbook documentation
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Monitoring & Alerting
|
||||||
|
|
||||||
|
Essential for knowing when to failover:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# Uptime monitoring for critical services
|
||||||
|
services.uptime-kuma = {
|
||||||
|
enable = true;
|
||||||
|
# Monitor: Headscale, nginx, Vaultwarden, AdGuard
|
||||||
|
};
|
||||||
|
|
||||||
|
# Or use external monitoring (BetterStack, Uptime Robot)
|
||||||
|
```
|
||||||
|
|
||||||
|
Alert on:
|
||||||
|
- Headscale API unreachable
|
||||||
|
- nginx health check fails
|
||||||
|
- DNS resolution fails
|
||||||
|
- NFS mount fails
|
||||||
340
ideas/service_backups.md
Normal file
340
ideas/service_backups.md
Normal file
|
|
@ -0,0 +1,340 @@
|
||||||
|
# Service Backup Strategy
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This document outlines the backup strategy for the NixOS fleet, covering critical data paths, backup tools, and recovery procedures.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
**No automated backups are running today.** This is a critical gap.
|
||||||
|
|
||||||
|
## Backup Topology
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ BACKUP TOPOLOGY │
|
||||||
|
├─────────────────────────────────────────────────────────────┤
|
||||||
|
│ │
|
||||||
|
│ H001,H003,O001,L001 ──────► H002:/data/backups (primary) │
|
||||||
|
│ └────► B2/S3 (offsite) │
|
||||||
|
│ │
|
||||||
|
│ H002 (NAS) ───────────────► B2/S3 (offsite only) │
|
||||||
|
│ │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Critical Paths by Host
|
||||||
|
|
||||||
|
### L001 (Headscale) - HIGHEST PRIORITY
|
||||||
|
|
||||||
|
| Path | Description | Size | Priority |
|
||||||
|
|------|-------------|------|----------|
|
||||||
|
| `/var/lib/headscale/` | SQLite DB with all node registrations | Small | CRITICAL |
|
||||||
|
| `/var/lib/acme/` | SSL certificates | Small | High |
|
||||||
|
|
||||||
|
**Impact if lost:** ALL mesh connectivity fails - new connections fail, devices can't rejoin.
|
||||||
|
|
||||||
|
### O001 (Oracle Gateway)
|
||||||
|
|
||||||
|
| Path | Description | Size | Priority |
|
||||||
|
|------|-------------|------|----------|
|
||||||
|
| `/var/lib/vaultwarden/` | Password vault (encrypted) | ~41MB | CRITICAL |
|
||||||
|
| `/var/lib/postgresql/` | Atuin shell history | ~226MB | Medium |
|
||||||
|
| `/var/lib/acme/` | SSL certificates | Small | High |
|
||||||
|
|
||||||
|
**Impact if lost:** All public access down, password manager lost.
|
||||||
|
|
||||||
|
### H001 (Services)
|
||||||
|
|
||||||
|
| Path | Description | Size | Priority |
|
||||||
|
|------|-------------|------|----------|
|
||||||
|
| `/var/lib/forgejo/` | Git repos + PostgreSQL | Large | CRITICAL |
|
||||||
|
| `/var/lib/zitadel/` | SSO database + config | Medium | CRITICAL |
|
||||||
|
| `/var/lib/openbao/` | Secrets vault | Small | CRITICAL |
|
||||||
|
| `/bao-keys/` | Vault unseal keys | Tiny | CRITICAL |
|
||||||
|
| `/var/lib/trilium/` | Notes database | Medium | High |
|
||||||
|
| `/var/lib/opengist/` | Gist data | Small | Medium |
|
||||||
|
| `/var/lib/open-webui/` | AI chat history | Medium | Low |
|
||||||
|
| `/var/lib/n8n/` | Workflows | Medium | Medium |
|
||||||
|
| `/var/lib/acme/` | SSL certificates | Small | High |
|
||||||
|
| `/var/lib/nixarr/state/` | Media manager configs | Small | Medium |
|
||||||
|
|
||||||
|
**Note:** A 154GB backup exists at `/var/lib/forgejo.tar.gz` - this is manual and should be automated.
|
||||||
|
|
||||||
|
### H003 (Router)
|
||||||
|
|
||||||
|
| Path | Description | Size | Priority |
|
||||||
|
|------|-------------|------|----------|
|
||||||
|
| `/var/lib/AdGuardHome/` | DNS filtering config + stats | Medium | High |
|
||||||
|
| `/boot/keyfile_nvme0n1p1` | LUKS encryption key | Tiny | CRITICAL |
|
||||||
|
|
||||||
|
**WARNING:** The LUKS keyfile must be stored separately in a secure location (e.g., Vaultwarden).
|
||||||
|
|
||||||
|
### H002 (NAS)
|
||||||
|
|
||||||
|
| Path | Description | Size | Priority |
|
||||||
|
|------|-------------|------|----------|
|
||||||
|
| `/data/nixarr/media/` | Movies, TV, music, books | Very Large | Low (replaceable) |
|
||||||
|
| `/data/pinchflat/` | YouTube downloads | Large | Low |
|
||||||
|
|
||||||
|
**Note:** bcachefs already has 2x replication. Offsite backup is optional but recommended for irreplaceable data.
|
||||||
|
|
||||||
|
## Recommended Backup Tool: Restic
|
||||||
|
|
||||||
|
### Why Restic?
|
||||||
|
|
||||||
|
- Modern, encrypted, deduplicated backups
|
||||||
|
- Native NixOS module: `services.restic.backups`
|
||||||
|
- Multiple backend support (local, S3, B2, SFTP)
|
||||||
|
- Incremental backups with deduplication
|
||||||
|
- Easy pruning/retention policies
|
||||||
|
|
||||||
|
### Shared Backup Module
|
||||||
|
|
||||||
|
Create a shared module at `modules/backup.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
{ config, lib, pkgs, ... }:
|
||||||
|
|
||||||
|
with lib;
|
||||||
|
let
|
||||||
|
cfg = config.my.backup;
|
||||||
|
in {
|
||||||
|
options.my.backup = {
|
||||||
|
enable = mkEnableOption "restic backups";
|
||||||
|
paths = mkOption { type = types.listOf types.str; default = []; };
|
||||||
|
exclude = mkOption { type = types.listOf types.str; default = []; };
|
||||||
|
postgresBackup = mkOption { type = types.bool; default = false; };
|
||||||
|
};
|
||||||
|
|
||||||
|
config = mkIf cfg.enable {
|
||||||
|
# PostgreSQL dumps before backup
|
||||||
|
services.postgresqlBackup = mkIf cfg.postgresBackup {
|
||||||
|
enable = true;
|
||||||
|
location = "/var/backup/postgresql";
|
||||||
|
compression = "zstd";
|
||||||
|
startAt = "02:00:00";
|
||||||
|
};
|
||||||
|
|
||||||
|
services.restic.backups = {
|
||||||
|
daily = {
|
||||||
|
paths = cfg.paths ++ (optional cfg.postgresBackup "/var/backup/postgresql");
|
||||||
|
exclude = cfg.exclude ++ [
|
||||||
|
"**/cache/**"
|
||||||
|
"**/Cache/**"
|
||||||
|
"**/.cache/**"
|
||||||
|
"**/tmp/**"
|
||||||
|
];
|
||||||
|
|
||||||
|
# Primary: NFS to H002
|
||||||
|
repository = "/nfs/h002/backups/${config.networking.hostName}";
|
||||||
|
|
||||||
|
passwordFile = config.age.secrets.restic-password.path;
|
||||||
|
initialize = true;
|
||||||
|
|
||||||
|
pruneOpts = [
|
||||||
|
"--keep-daily 7"
|
||||||
|
"--keep-weekly 4"
|
||||||
|
"--keep-monthly 6"
|
||||||
|
];
|
||||||
|
|
||||||
|
timerConfig = {
|
||||||
|
OnCalendar = "03:00:00";
|
||||||
|
RandomizedDelaySec = "1h";
|
||||||
|
Persistent = true;
|
||||||
|
};
|
||||||
|
|
||||||
|
backupPrepareCommand = ''
|
||||||
|
# Ensure NFS is mounted
|
||||||
|
mount | grep -q "/nfs/h002" || mount /nfs/h002
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
|
||||||
|
# Offsite to B2/S3 (less frequent)
|
||||||
|
offsite = {
|
||||||
|
paths = cfg.paths;
|
||||||
|
repository = "b2:joshuabell-backups:${config.networking.hostName}";
|
||||||
|
passwordFile = config.age.secrets.restic-password.path;
|
||||||
|
environmentFile = config.age.secrets.b2-credentials.path;
|
||||||
|
|
||||||
|
pruneOpts = [
|
||||||
|
"--keep-daily 3"
|
||||||
|
"--keep-weekly 2"
|
||||||
|
"--keep-monthly 3"
|
||||||
|
];
|
||||||
|
|
||||||
|
timerConfig = {
|
||||||
|
OnCalendar = "weekly";
|
||||||
|
Persistent = true;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Per-Host Configuration
|
||||||
|
|
||||||
|
#### L001 (Headscale)
|
||||||
|
```nix
|
||||||
|
my.backup = {
|
||||||
|
enable = true;
|
||||||
|
paths = [ "/var/lib/headscale" "/var/lib/acme" ];
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
#### O001 (Oracle)
|
||||||
|
```nix
|
||||||
|
my.backup = {
|
||||||
|
enable = true;
|
||||||
|
paths = [ "/var/lib/vaultwarden" "/var/lib/acme" ];
|
||||||
|
postgresBackup = true; # For Atuin
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
#### H001 (Services)
|
||||||
|
```nix
|
||||||
|
my.backup = {
|
||||||
|
enable = true;
|
||||||
|
paths = [
|
||||||
|
"/var/lib/forgejo"
|
||||||
|
"/var/lib/zitadel"
|
||||||
|
"/var/lib/openbao"
|
||||||
|
"/bao-keys"
|
||||||
|
"/var/lib/trilium"
|
||||||
|
"/var/lib/opengist"
|
||||||
|
"/var/lib/open-webui"
|
||||||
|
"/var/lib/n8n"
|
||||||
|
"/var/lib/acme"
|
||||||
|
"/var/lib/nixarr/state"
|
||||||
|
];
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
#### H003 (Router)
|
||||||
|
```nix
|
||||||
|
my.backup = {
|
||||||
|
enable = true;
|
||||||
|
paths = [ "/var/lib/AdGuardHome" ];
|
||||||
|
# LUKS key backed up separately to Vaultwarden
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
## Database Backup Best Practices
|
||||||
|
|
||||||
|
### For Containerized PostgreSQL (Forgejo/Zitadel)
|
||||||
|
|
||||||
|
```nix
|
||||||
|
systemd.services.container-forgejo-backup = {
|
||||||
|
script = ''
|
||||||
|
nixos-container run forgejo -- pg_dumpall -U forgejo \
|
||||||
|
| ${pkgs.zstd}/bin/zstd > /var/lib/forgejo/backups/db-$(date +%Y%m%d).sql.zst
|
||||||
|
'';
|
||||||
|
startAt = "02:30:00"; # Before restic runs at 03:00
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### For Direct PostgreSQL
|
||||||
|
|
||||||
|
```nix
|
||||||
|
services.postgresqlBackup = {
|
||||||
|
enable = true;
|
||||||
|
backupAll = true;
|
||||||
|
location = "/var/backup/postgresql";
|
||||||
|
compression = "zstd";
|
||||||
|
startAt = "*-*-* 02:00:00";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
## Recovery Procedures
|
||||||
|
|
||||||
|
### Restoring from Restic
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# List snapshots
|
||||||
|
restic -r /path/to/repo snapshots
|
||||||
|
|
||||||
|
# Restore specific snapshot
|
||||||
|
restic -r /path/to/repo restore abc123 --target /restore
|
||||||
|
|
||||||
|
# Restore latest
|
||||||
|
restic -r /path/to/repo restore latest --target /restore
|
||||||
|
|
||||||
|
# Restore specific path
|
||||||
|
restic -r /path/to/repo restore latest \
|
||||||
|
--target /restore \
|
||||||
|
--include /var/lib/postgresql
|
||||||
|
|
||||||
|
# Mount for browsing
|
||||||
|
mkdir /mnt/restic
|
||||||
|
restic -r /path/to/repo mount /mnt/restic
|
||||||
|
```
|
||||||
|
|
||||||
|
### PostgreSQL Recovery
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stop PostgreSQL
|
||||||
|
systemctl stop postgresql
|
||||||
|
|
||||||
|
# Restore from restic
|
||||||
|
restic restore latest --target / --include /var/lib/postgresql
|
||||||
|
|
||||||
|
# Or from SQL dump
|
||||||
|
sudo -u postgres psql < /restore/all-databases.sql
|
||||||
|
|
||||||
|
# Start PostgreSQL
|
||||||
|
systemctl start postgresql
|
||||||
|
```
|
||||||
|
|
||||||
|
## Backup Verification
|
||||||
|
|
||||||
|
Add automated verification:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
systemd.timers.restic-verify = {
|
||||||
|
wantedBy = [ "timers.target" ];
|
||||||
|
timerConfig = {
|
||||||
|
OnCalendar = "weekly";
|
||||||
|
Persistent = true;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
systemd.services.restic-verify = {
|
||||||
|
script = ''
|
||||||
|
${pkgs.restic}/bin/restic -r /path/to/repo check --read-data-subset=5%
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
## Monitoring & Alerting
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# Alert on backup failure
|
||||||
|
systemd.services."restic-backups-daily".onFailure = [ "notify-failure@%n.service" ]; # OnFailure is a [Unit] directive, so use the NixOS onFailure option, not serviceConfig
|
||||||
|
|
||||||
|
systemd.services."notify-failure@" = {
|
||||||
|
serviceConfig.Type = "oneshot";
|
||||||
|
script = ''
|
||||||
|
${pkgs.curl}/bin/curl -X POST https://ntfy.sh/joshuabell-backups \
|
||||||
|
-H "Title: Backup Failed" \
|
||||||
|
-d "Service: %i on ${config.networking.hostName}" # NOTE(review): %i is not expanded inside `script` bodies; pass it via scriptArgs = "%i" and use "$1" — confirm
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
## Action Items
|
||||||
|
|
||||||
|
### Immediate (This Week)
|
||||||
|
- [ ] Set up restic backups for L001 (Headscale) - most critical
|
||||||
|
- [ ] Back up H003's LUKS keyfile to Vaultwarden
|
||||||
|
- [ ] Create `/data/backups/` directory on H002
|
||||||
|
|
||||||
|
### Short-Term (This Month)
|
||||||
|
- [ ] Implement shared backup module
|
||||||
|
- [ ] Deploy to all hosts
|
||||||
|
- [ ] Set up offsite B2 bucket
|
||||||
|
|
||||||
|
### Medium-Term
|
||||||
|
- [ ] Automated backup verification
|
||||||
|
- [ ] Monitoring/alerting integration
|
||||||
|
- [ ] Test recovery procedures
|
||||||
Loading…
Add table
Add a link
Reference in a new issue