Files
zmb-webui/backend/services/zfs_runner.py
T
patrick 3bc57ef36b Fix: ZFS-Erkennung via tatsächlichem zpool-Test statt which()
shutil.which() reicht nicht: auf privilegierten LXC-Containern ist
zpool installiert, aber /dev/zfs fehlt → Befehl schlägt fehl.
_probe_zfs() führt zpool list einmal aus und wertet den Exit-Code aus.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-05 15:15:55 +02:00

569 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
ZFS Command Runner Wrapper für zpool/zfs CLI Commands
Handles subprocess execution, parsing, caching, error handling
"""
import subprocess
import json
import logging
import glob
import os
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass
from datetime import datetime, timedelta
import re
logger = logging.getLogger(__name__)
# Detect ZFS availability once at import time — avoids repeated ERROR logs on LXC/non-ZFS systems
# shutil.which is not enough: on privileged LXC containers zpool exists but /dev/zfs is missing
def _probe_zfs() -> bool:
try:
r = subprocess.run(["zpool", "list"], capture_output=True, timeout=3)
return r.returncode == 0
except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
return False
ZFS_AVAILABLE = _probe_zfs()
if not ZFS_AVAILABLE:
logger.info("ZFS not available on this system (pools/snapshots disabled)")
# Cache with TTL
@dataclass
class CacheEntry:
data: Any
expires_at: datetime
class ZFSCache:
def __init__(self):
self.pool_status = CacheEntry(None, datetime.now())
self.snapshots = CacheEntry(None, datetime.now())
self.datasets = CacheEntry(None, datetime.now())
def get(self, key: str) -> Optional[Any]:
cache_dict = {
"pool_status": self.pool_status,
"snapshots": self.snapshots,
"datasets": self.datasets,
}
if key not in cache_dict:
return None
entry = cache_dict[key]
if not entry or not entry.data:
return None
if datetime.now() > entry.expires_at:
return None
return entry.data
def set(self, key: str, data: Any, ttl_seconds: int = 60):
cache_dict = {
"pool_status": self.pool_status,
"snapshots": self.snapshots,
"datasets": self.datasets,
}
if key in cache_dict:
cache_dict[key].data = data
cache_dict[key].expires_at = datetime.now() + timedelta(seconds=ttl_seconds)
class ZFSRunner:
def __init__(self, timeout: int = 5):
self.timeout = timeout
self.cache = ZFSCache()
def run_command(self, cmd: List[str], timeout: Optional[int] = None) -> Tuple[str, str, int]:
"""
Run subprocess command with timeout
Returns: (stdout, stderr, returncode)
"""
if timeout is None:
timeout = self.timeout
try:
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=timeout,
check=False
)
return result.stdout, result.stderr, result.returncode
except subprocess.TimeoutExpired:
logger.error(f"Command timeout after {timeout}s: {' '.join(cmd)}")
return "", f"Command timeout after {timeout}s", -1
except FileNotFoundError:
logger.debug(f"Command not found: {' '.join(cmd)}")
return "", f"Command not found: {cmd[0]}", -1
except Exception as e:
logger.error(f"Command execution error: {e}")
return "", str(e), -1
# ============== POOL OPERATIONS ==============
def list_pools(self) -> List[Dict[str, Any]]:
"""
Get list of ZFS pools with status
Uses cache (TTL 30s)
"""
cached = self.cache.get("pool_status")
if cached:
return cached
if not ZFS_AVAILABLE:
return []
stdout, stderr, rc = self.run_command(["zpool", "list", "-H", "-p"])
if rc != 0:
logger.error(f"zpool list failed: {stderr}")
return []
pools = []
for line in stdout.strip().split("\n"):
if not line:
continue
parts = line.split()
if len(parts) < 10:
continue
pool = {
"name": parts[0],
"size": int(parts[1]),
"alloc": int(parts[2]),
"free": int(parts[3]),
"fragmentation": f"{parts[6]}%",
"capacity": f"{parts[7]}%",
"health": parts[9]
}
pools.append(pool)
self.cache.set("pool_status", pools, ttl_seconds=30)
return pools
def _parse_vdev_tree(self, config_lines: List[str]) -> List[Dict[str, Any]]:
"""
Parse VDEV tree from zpool status config section.
Uses indentation levels to reconstruct hierarchy.
Returns list of vdev dicts with name, state, and error counters (read/write/cksum).
"""
roots: List[Dict] = []
stack: List[tuple] = [] # (indent, vdev_dict)
for line in config_lines:
if not line.strip():
continue
# Skip header line (NAME STATE READ WRITE CKSUM)
if line.strip().startswith("NAME"):
continue
indent = len(line) - len(line.lstrip())
parts = line.split()
if not parts:
continue
name = parts[0]
state = parts[1] if len(parts) > 1 else "UNKNOWN"
# Parse error counters and convert to integers
read = 0
write = 0
cksum = 0
if len(parts) > 2:
try:
read = int(parts[2])
except (ValueError, IndexError):
read = 0
if len(parts) > 3:
try:
write = int(parts[3])
except (ValueError, IndexError):
write = 0
if len(parts) > 4:
try:
cksum = int(parts[4])
except (ValueError, IndexError):
cksum = 0
vdev: Dict[str, Any] = {
"name": name,
"state": state,
"read": read,
"write": write,
"cksum": cksum,
"children": []
}
# Pop stack entries that are at same or deeper indent
while stack and stack[-1][0] >= indent:
stack.pop()
if stack:
stack[-1][1]["children"].append(vdev)
else:
roots.append(vdev)
stack.append((indent, vdev))
return roots
def get_disk_id_map(self) -> Dict[str, str]:
"""
Build mapping {sda: ata-WDC_WD20EZRZ-..., nvme0n1: nvme-Samsung_...} from /dev/disk/by-id/.
Prefers ata- prefix, then nvme-, then scsi-, then wwn-.
"""
mapping: Dict[str, str] = {}
priority = ["ata-", "nvme-", "scsi-", "wwn-"]
for pattern in priority:
for link in glob.glob(f"/dev/disk/by-id/{pattern}*"):
if "-part" in os.path.basename(link):
continue
try:
target = os.path.realpath(link)
disk = os.path.basename(target)
if disk not in mapping:
mapping[disk] = os.path.basename(link)
except OSError:
continue
return mapping
def get_smart_info(self, disk: str) -> Dict[str, Any]:
"""
Run smartctl -A -i --json on /dev/disk and return parsed health data.
Returns empty dict if smartctl unavailable or disk not SMART-capable.
"""
stdout, stderr, rc = self.run_command(["smartctl", "-A", "-i", "--json", f"/dev/{disk}"])
if not stdout:
return {}
try:
data = json.loads(stdout)
except json.JSONDecodeError:
return {}
result: Dict[str, Any] = {}
# Device info
device = data.get("device", {})
model_info = data.get("model_name") or data.get("model_family", "")
serial = data.get("serial_number", "")
result["model"] = model_info
result["serial"] = serial
result["protocol"] = device.get("protocol", "")
# Power-on hours
result["power_on_hours"] = data.get("power_on_time", {}).get("hours")
# Temperature
temp = data.get("temperature", {})
result["temperature"] = temp.get("current")
# Overall health (from -H flag data embedded in -i)
smart_status = data.get("smart_status", {})
result["passed"] = smart_status.get("passed")
# Key attributes from ata_smart_attributes
attrs = {a["name"]: a for a in data.get("ata_smart_attributes", {}).get("table", [])}
result["reallocated_sectors"] = attrs.get("Reallocated_Sector_Ct", {}).get("raw", {}).get("value", 0)
result["pending_sectors"] = attrs.get("Current_Pending_Sector", {}).get("raw", {}).get("value", 0)
result["uncorrectable"] = attrs.get("Offline_Uncorrectable", {}).get("raw", {}).get("value", 0)
return result
def get_pool_status(self, pool_name: str) -> Dict[str, Any]:
"""
Get detailed pool status including VDEV tree and error counters
"""
if not ZFS_AVAILABLE:
return {}
stdout, stderr, rc = self.run_command(["zpool", "status", pool_name])
if rc != 0:
logger.error(f"zpool status failed for {pool_name}: {stderr}")
return {}
status: Dict[str, Any] = {
"name": pool_name,
"state": None,
"scan": None,
"errors": None,
"vdevs": [],
}
lines = stdout.split("\n")
in_config = False
config_lines: List[str] = []
for line in lines:
stripped = line.strip()
if stripped.startswith("state:"):
status["state"] = stripped.split(":", 1)[1].strip()
elif stripped.startswith("scan:"):
status["scan"] = stripped.split(":", 1)[1].strip()
elif stripped.startswith("errors:"):
status["errors"] = stripped.split(":", 1)[1].strip()
in_config = False
elif stripped == "config:":
in_config = True
elif in_config:
# Collect config block lines (skip blank lines at start)
if stripped or config_lines:
config_lines.append(line)
if config_lines:
parsed = self._parse_vdev_tree(config_lines)
if parsed and parsed[0]["name"] == pool_name:
status["vdevs"] = parsed[0]["children"]
else:
status["vdevs"] = parsed
# Annotate leaf vdevs with disk_id from /dev/disk/by-id/
disk_id_map = self.get_disk_id_map()
self._annotate_disk_ids(status["vdevs"], disk_id_map)
return status
def _annotate_disk_ids(self, vdevs: List[Dict], disk_id_map: Dict[str, str]) -> None:
"""Recursively annotate leaf vdev nodes with disk_id from by-id map."""
for vdev in vdevs:
children = vdev.get("children", [])
if not children:
name = vdev.get("name", "")
# Strip partition suffix (sda1 → sda)
base = re.sub(r'\d+$', '', name) if name[-1:].isdigit() else name
vdev["disk_id"] = disk_id_map.get(name) or disk_id_map.get(base)
else:
self._annotate_disk_ids(children, disk_id_map)
def scrub_pool(self, pool_name: str) -> Dict[str, str]:
"""
Start or resume scrub on pool
"""
stdout, stderr, rc = self.run_command(["zpool", "scrub", pool_name])
if rc != 0:
logger.error(f"zpool scrub failed for {pool_name}: {stderr}")
return {"status": "error", "message": stderr}
return {"status": "success", "message": f"Scrub started for {pool_name}"}
# ============== DATASET/FILESYSTEM OPERATIONS ==============
def list_datasets(self, pool_name: str, max_depth: int = 2) -> List[Dict[str, Any]]:
"""
List datasets in pool (with depth limit for performance)
"""
cached = self.cache.get("datasets")
if cached and cached.get(pool_name):
return cached[pool_name]
if not ZFS_AVAILABLE:
return []
stdout, stderr, rc = self.run_command([
"zfs", "list", "-d", str(max_depth), "-H", "-p",
"-o", "name,used,avail,refer,mountpoint,type",
pool_name
])
if rc != 0:
logger.error(f"zfs list failed for {pool_name}: {stderr}")
return []
datasets = []
for line in stdout.strip().split("\n"):
if not line:
continue
parts = line.split("\t")
if len(parts) < 6:
continue
dataset = {
"name": parts[0],
"used": int(parts[1]),
"avail": int(parts[2]),
"refer": int(parts[3]),
"mountpoint": parts[4],
"type": parts[5] # filesystem, volume, snapshot
}
datasets.append(dataset)
# Cache per pool
if not cached:
cached = {}
cached[pool_name] = datasets
self.cache.set("datasets", cached, ttl_seconds=60)
return datasets
def create_dataset(self, dataset_name: str, props: Optional[Dict[str, str]] = None) -> Dict[str, str]:
"""
Create new ZFS dataset/filesystem
"""
cmd = ["zfs", "create"]
if props:
for key, val in props.items():
cmd.extend(["-o", f"{key}={val}"])
cmd.append(dataset_name)
stdout, stderr, rc = self.run_command(cmd)
if rc != 0:
logger.error(f"zfs create failed: {stderr}")
return {"status": "error", "message": stderr}
return {"status": "success", "message": f"Dataset {dataset_name} created"}
def set_dataset_properties(self, dataset_name: str, props: Dict[str, str]) -> Dict[str, str]:
"""Set ZFS dataset properties (compression, quota, reservation, etc.)"""
errors = []
for key, value in props.items():
if value is None:
continue
stdout, stderr, rc = self.run_command(["zfs", "set", f"{key}={value}", dataset_name])
if rc != 0:
errors.append(f"{key}: {stderr.strip()}")
if errors:
return {"status": "error", "message": "; ".join(errors)}
return {"status": "success", "message": f"Properties updated for {dataset_name}"}
def destroy_dataset(self, dataset_name: str, recursive: bool = False) -> Dict[str, str]:
"""
Destroy ZFS dataset
"""
cmd = ["zfs", "destroy"]
if recursive:
cmd.append("-r")
cmd.append(dataset_name)
stdout, stderr, rc = self.run_command(cmd)
if rc != 0:
logger.error(f"zfs destroy failed: {stderr}")
return {"status": "error", "message": stderr}
return {"status": "success", "message": f"Dataset {dataset_name} destroyed"}
# ============== SNAPSHOT OPERATIONS ==============
def list_snapshots(self, dataset_name: Optional[str] = None, limit: int = 50) -> List[Dict[str, Any]]:
"""
List snapshots (with limit for performance on many snapshots)
"""
cache_key = f"snapshots:{dataset_name or '*'}"
cached = self.cache.get(cache_key)
if cached:
return cached
if not ZFS_AVAILABLE:
return []
if dataset_name:
# No -d limit so sub-datasets (e.g. tank/share) are included
cmd = ["zfs", "list", "-t", "snapshot", "-r", "-H", "-p",
"-o", "name,used,referenced,creation", dataset_name]
else:
cmd = ["zfs", "list", "-t", "snapshot", "-H", "-p",
"-o", "name,used,referenced,creation"]
stdout, stderr, rc = self.run_command(cmd)
if rc != 0:
logger.error(f"zfs list snapshots failed: {stderr}")
return []
snapshots = []
for line in stdout.strip().split("\n"):
if not line:
continue
parts = line.split("\t")
if len(parts) < 4:
continue
snapshot = {
"name": parts[0],
"used": int(parts[1]),
"referenced": int(parts[2]),
"creation": int(parts[3]), # Unix timestamp
}
snapshots.append(snapshot)
# Sort by creation time (newest first) and limit
snapshots.sort(key=lambda x: x["creation"], reverse=True)
snapshots = snapshots[:limit]
self.cache.set(cache_key, snapshots, ttl_seconds=60)
return snapshots
def create_snapshot(self, dataset_name: str, snapshot_name: Optional[str] = None) -> Dict[str, str]:
"""
Create snapshot with auto-generated name if not provided
"""
if not snapshot_name:
from datetime import datetime as dt
snapshot_name = dt.now().strftime("%Y%m%d-%H%M%S")
full_name = f"{dataset_name}@{snapshot_name}"
stdout, stderr, rc = self.run_command(["zfs", "snapshot", full_name])
if rc != 0:
logger.error(f"zfs snapshot failed: {stderr}")
return {"status": "error", "message": stderr}
return {"status": "success", "message": f"Snapshot {full_name} created"}
def destroy_snapshot(self, snapshot_name: str, recursive: bool = False) -> Dict[str, str]:
"""
Destroy snapshot
"""
cmd = ["zfs", "destroy"]
if recursive:
cmd.append("-r")
cmd.append(snapshot_name)
stdout, stderr, rc = self.run_command(cmd)
if rc != 0:
logger.error(f"zfs destroy snapshot failed: {stderr}")
return {"status": "error", "message": stderr}
return {"status": "success", "message": f"Snapshot {snapshot_name} destroyed"}
def rollback_snapshot(self, snapshot_name: str) -> Dict[str, str]:
"""
Rollback dataset to snapshot
WARNING: Destroys data after snapshot!
"""
stdout, stderr, rc = self.run_command(["zfs", "rollback", "-r", snapshot_name])
if rc != 0:
logger.error(f"zfs rollback failed: {stderr}")
return {"status": "error", "message": stderr}
return {"status": "success", "message": f"Rolled back to {snapshot_name}"}
# ============== UTILITY ==============
def clear_cache(self):
"""Clear all caches"""
self.cache = ZFSCache()
logger.info("ZFS cache cleared")
# Global instance
zfs_runner = ZFSRunner()