monitor : temp GPU, conso watts, espace disque, heartbeat dispatcher
- nvidia-smi : +temperature.gpu + power.draw
- UI : tags °C / W / espace disque libre
- Dispatcher heartbeat toutes les 4s → point vert/rouge en haut du monitor
- Fix Docker SSH : copie + chmod 600 au démarrage (Bad owner)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
42
app/main.py
42
app/main.py
@@ -213,7 +213,25 @@ async def partial_jobs(request: Request):
|
||||
@app.get("/partials/monitor", response_class=HTMLResponse)
async def partial_monitor(request: Request):
    """Render the ``_monitor.html`` partial.

    Collects one stats dict per entry of ``WORKERS`` concurrently via
    ``_worker_stats`` and adds the dispatcher heartbeat status so the
    template can display it alongside the workers.

    Args:
        request: The incoming request, passed through to the template context.

    Returns:
        The template response for ``_monitor.html`` with the context keys
        ``request``, ``workers`` and ``dispatcher``.
    """
    stats = await asyncio.gather(*[_worker_stats(w) for w in WORKERS])
    # Single return: the context must carry "dispatcher" in addition to the
    # worker stats, otherwise the template cannot show the heartbeat state.
    return templates.TemplateResponse("_monitor.html", {
        "request": request,
        "workers": stats,
        "dispatcher": _dispatcher_status(),
    })
|
||||
|
||||
|
||||
def _dispatcher_status(max_age_s: int = 30) -> dict:
    """Report whether the dispatcher heartbeat file is recent.

    Reads ``dispatcher.heartbeat`` located next to ``DB_PATH``, parses its
    content with ``_parse_ts`` and compares the result with the current UTC
    time. A parsed timestamp without tzinfo is interpreted as UTC.

    Args:
        max_age_s: Maximum distance, in seconds, between the heartbeat and
            the current time for the dispatcher to count as alive.
            Defaults to 30.

    Returns:
        ``{"alive": bool, "age_s": int | None}``. ``age_s`` is the heartbeat
        age in whole seconds (never negative), or ``None`` when the file
        cannot be read or its content cannot be parsed.
    """
    hb = DB_PATH.parent / "dispatcher.heartbeat"
    try:
        ts = _parse_ts(hb.read_text().strip())
        if ts:
            if ts.tzinfo is None:
                ts = ts.replace(tzinfo=timezone.utc)
            age = int((datetime.now(timezone.utc) - ts).total_seconds())
            # A heartbeat slightly in the future is ordinary clock skew and
            # still counts as alive; one far in the future is bogus and must
            # not keep the dispatcher "alive" forever (a negative age would
            # always satisfy a plain `age < max_age_s`).
            return {"alive": abs(age) < max_age_s, "age_s": max(age, 0)}
    except Exception:
        # Deliberate best-effort: a missing, unreadable or malformed heartbeat
        # file is reported as "not alive" instead of failing the caller.
        pass
    return {"alive": False, "age_s": None}
|
||||
|
||||
|
||||
async def _worker_stats(worker: dict) -> dict:
|
||||
@@ -221,20 +239,30 @@ async def _worker_stats(worker: dict) -> dict:
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
"ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", alias,
|
||||
"nvidia-smi --query-gpu=memory.used,memory.total,utilization.gpu --format=csv,noheader,nounits && df -h / | tail -1",
|
||||
"nvidia-smi --query-gpu=memory.used,memory.total,utilization.gpu,temperature.gpu,power.draw --format=csv,noheader,nounits && df -h / | tail -1",
|
||||
stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
out, _ = await asyncio.wait_for(proc.communicate(), timeout=4)
|
||||
out, _ = await asyncio.wait_for(proc.communicate(), timeout=5)
|
||||
text = out.decode().strip().splitlines()
|
||||
gpu_line = text[0].split(",") if text else ["?", "?", "?"]
|
||||
g = [x.strip() for x in text[0].split(",")] if text else ["?"] * 5
|
||||
disk = text[1].split() if len(text) > 1 else ["?"] * 6
|
||||
|
||||
def _int(v: str):
|
||||
try:
|
||||
return int(float(v))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
return {
|
||||
**worker,
|
||||
"online": True,
|
||||
"vram_used_mib": int(gpu_line[0].strip()) if gpu_line[0].strip().isdigit() else None,
|
||||
"vram_total_mib": int(gpu_line[1].strip()) if gpu_line[1].strip().isdigit() else None,
|
||||
"gpu_util_pct": int(gpu_line[2].strip()) if gpu_line[2].strip().isdigit() else None,
|
||||
"vram_used_mib": _int(g[0]) if len(g) > 0 else None,
|
||||
"vram_total_mib": _int(g[1]) if len(g) > 1 else None,
|
||||
"gpu_util_pct": _int(g[2]) if len(g) > 2 else None,
|
||||
"gpu_temp_c": _int(g[3]) if len(g) > 3 else None,
|
||||
"gpu_power_w": _int(g[4]) if len(g) > 4 else None,
|
||||
"disk_used_pct": disk[4] if len(disk) > 4 else "?",
|
||||
"disk_avail": disk[3] if len(disk) > 3 else "?",
|
||||
}
|
||||
except Exception as e:
|
||||
return {**worker, "online": False, "error": str(e)[:80]}
|
||||
|
||||
Reference in New Issue
Block a user