diff --git a/Dockerfile b/Dockerfile index ee46bcc..4a9667b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,4 +16,4 @@ ENV COSMA_QC_DB=/var/lib/cosma-qc/jobs.db EXPOSE 8000 -CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] +CMD ["/bin/sh", "-c", "if [ -d /ssh-in ]; then mkdir -p /root/.ssh && cp -r /ssh-in/. /root/.ssh/ && chmod 700 /root/.ssh && chmod 600 /root/.ssh/* 2>/dev/null; fi && uvicorn app.main:app --host 0.0.0.0 --port 8000"] diff --git a/app/main.py b/app/main.py index d253693..a357bc6 100644 --- a/app/main.py +++ b/app/main.py @@ -213,7 +213,25 @@ async def partial_jobs(request: Request): @app.get("/partials/monitor", response_class=HTMLResponse) async def partial_monitor(request: Request): stats = await asyncio.gather(*[_worker_stats(w) for w in WORKERS]) - return templates.TemplateResponse("_monitor.html", {"request": request, "workers": stats}) + return templates.TemplateResponse("_monitor.html", { + "request": request, + "workers": stats, + "dispatcher": _dispatcher_status(), + }) + + +def _dispatcher_status() -> dict: + hb = DB_PATH.parent / "dispatcher.heartbeat" + try: + ts = _parse_ts(hb.read_text().strip()) + if ts: + if ts.tzinfo is None: + ts = ts.replace(tzinfo=timezone.utc) + age = int((datetime.now(timezone.utc) - ts).total_seconds()) + return {"alive": age < 30, "age_s": age} + except Exception: + pass + return {"alive": False, "age_s": None} async def _worker_stats(worker: dict) -> dict: @@ -221,20 +239,30 @@ async def _worker_stats(worker: dict) -> dict: try: proc = await asyncio.create_subprocess_exec( "ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", alias, - "nvidia-smi --query-gpu=memory.used,memory.total,utilization.gpu --format=csv,noheader,nounits && df -h / | tail -1", + "nvidia-smi --query-gpu=memory.used,memory.total,utilization.gpu,temperature.gpu,power.draw --format=csv,noheader,nounits && df -h / | tail -1", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) - out, _ = await asyncio.wait_for(proc.communicate(), timeout=4) + out, _ = await asyncio.wait_for(proc.communicate(), timeout=5) text = out.decode().strip().splitlines() - gpu_line = text[0].split(",") if text else ["?", "?", "?"] + g = [x.strip() for x in text[0].split(",")] if text else ["?"] * 5 disk = text[1].split() if len(text) > 1 else ["?"] * 6 + + def _int(v: str): + try: + return int(float(v)) + except Exception: + return None + return { **worker, "online": True, - "vram_used_mib": int(gpu_line[0].strip()) if gpu_line[0].strip().isdigit() else None, - "vram_total_mib": int(gpu_line[1].strip()) if gpu_line[1].strip().isdigit() else None, - "gpu_util_pct": int(gpu_line[2].strip()) if gpu_line[2].strip().isdigit() else None, + "vram_used_mib": _int(g[0]) if len(g) > 0 else None, + "vram_total_mib": _int(g[1]) if len(g) > 1 else None, + "gpu_util_pct": _int(g[2]) if len(g) > 2 else None, + "gpu_temp_c": _int(g[3]) if len(g) > 3 else None, + "gpu_power_w": _int(g[4]) if len(g) > 4 else None, "disk_used_pct": disk[4] if len(disk) > 4 else "?", + "disk_avail": disk[3] if len(disk) > 3 else "?", } except Exception as e: return {**worker, "online": False, "error": str(e)[:80]} diff --git a/app/static/style.css b/app/static/style.css index c8ab06f..d58b6f1 100644 --- a/app/static/style.css +++ b/app/static/style.css @@ -90,6 +90,18 @@ button.mini { padding: 0 0.4rem; font-size: 0.75rem; line-height: 1.4; } @keyframes spin { to { transform: rotate(360deg); } } +.monitor-header { margin-bottom: 0.75rem; } +.dispatcher-status { display: inline-flex; align-items: center; gap: 0.4rem; + font-size: 0.8rem; color: var(--muted); } +.dispatcher-status .dot { width: 8px; height: 8px; border-radius: 50%; + background: var(--err); flex-shrink: 0; } +.dispatcher-status.alive .dot { background: var(--ok); } +.dispatcher-status.alive { color: var(--ok); } +.worker-meta { display: flex; flex-wrap: wrap; gap: 0.3rem; margin-top: 0.35rem; } +.tag { font-size: 0.75rem; padding: 0.1rem 0.35rem; border-radius: 4px; + background: rgba(255,255,255,0.05); color: var(--text); } +.tag.warn { color: var(--warn); } + button { background: transparent; color: var(--accent); border: 1px solid var(--border); padding: 0.2rem 0.6rem; border-radius: 6px; cursor: pointer; font-family: inherit; font-size: 0.75rem; } button:hover { border-color: var(--accent); } diff --git a/app/templates/_monitor.html b/app/templates/_monitor.html index ba35ff1..8208e6a 100644 --- a/app/templates/_monitor.html +++ b/app/templates/_monitor.html @@ -1,3 +1,17 @@ +
+
+ + dispatcher + {% if dispatcher.alive %} + · il y a {{ dispatcher.age_s }}s + {% elif dispatcher.age_s is not none %} + · last seen {{ dispatcher.age_s }}s ago + {% else %} + · non démarré + {% endif %} +
+
+
{% for w in workers %}
@@ -17,7 +31,12 @@ {{ w.gpu_util_pct }}%
-
Disk /{{ w.disk_used_pct }}
+
+ {% if w.gpu_temp_c is not none %}{{ w.gpu_temp_c }}°C{% endif %} + {% if w.gpu_power_w is not none %}{{ w.gpu_power_w }}W{% endif %} + / {{ w.disk_avail }} dispo + {{ w.disk_used_pct }} utilisé +
{% else %}
{{ w.error or "unreachable" }}
{% endif %} diff --git a/scripts/dispatcher.py b/scripts/dispatcher.py index b52ff7a..75eca6d 100644 --- a/scripts/dispatcher.py +++ b/scripts/dispatcher.py @@ -329,9 +329,18 @@ def pop_queued_stitch() -> sqlite3.Row | None: ).fetchone() +def write_heartbeat(): + hb = DB_PATH.parent / "dispatcher.heartbeat" + try: + hb.write_text(_now_iso()) + except Exception: + pass + + def main(): print(f"cosma-qc dispatcher · DB={DB_PATH} · workers={[w['host'] for w in WORKERS]}") while True: + write_heartbeat() job = pop_queued() if job: print(f"→ job #{job['id']} ({job['auv']}/{job['gopro_serial']}/{job['segment_label']})")