monitor : temp GPU, conso watts, espace disque, heartbeat dispatcher
- nvidia-smi : +temperature.gpu + power.draw
- UI : tags °C / W / espace disque libre
- Dispatcher heartbeat toutes les 4s → point vert/rouge en haut du monitor
- Fix Docker SSH : copie + chmod 600 au démarrage (Bad owner)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,4 +16,4 @@ ENV COSMA_QC_DB=/var/lib/cosma-qc/jobs.db
|
|||||||
|
|
||||||
EXPOSE 8000
|
EXPOSE 8000
|
||||||
|
|
||||||
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
# Copy the SSH material mounted at /ssh-in into /root/.ssh with strict
# permissions, then start the web app.
# The SSH setup is joined to the server start with ';' rather than '&&':
# a failing chmod (e.g. /ssh-in is empty, so the glob matches nothing) must
# not prevent uvicorn from starting. 'exec' replaces the shell so uvicorn
# receives container signals directly.
CMD ["/bin/sh", "-c", "if [ -d /ssh-in ]; then mkdir -p /root/.ssh && cp -r /ssh-in/. /root/.ssh/ && chmod 700 /root/.ssh && chmod 600 /root/.ssh/* 2>/dev/null; fi; exec uvicorn app.main:app --host 0.0.0.0 --port 8000"]
|
||||||
|
|||||||
42
app/main.py
42
app/main.py
@@ -213,7 +213,25 @@ async def partial_jobs(request: Request):
|
|||||||
@app.get("/partials/monitor", response_class=HTMLResponse)
|
@app.get("/partials/monitor", response_class=HTMLResponse)
|
||||||
async def partial_monitor(request: Request):
|
async def partial_monitor(request: Request):
|
||||||
stats = await asyncio.gather(*[_worker_stats(w) for w in WORKERS])
|
stats = await asyncio.gather(*[_worker_stats(w) for w in WORKERS])
|
||||||
return templates.TemplateResponse("_monitor.html", {"request": request, "workers": stats})
|
return templates.TemplateResponse("_monitor.html", {
|
||||||
|
"request": request,
|
||||||
|
"workers": stats,
|
||||||
|
"dispatcher": _dispatcher_status(),
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def _dispatcher_status() -> dict:
    """Report whether the dispatcher has written a recent heartbeat.

    Reads ``dispatcher.heartbeat`` from the directory that holds ``DB_PATH``
    and parses its content with ``_parse_ts``.

    Returns:
        A dict with two keys:
        ``alive`` -- True when the heartbeat is less than 30 seconds old.
        ``age_s`` -- heartbeat age in whole seconds (never negative), or
        None when the file is missing, unreadable or unparsable.
    """
    hb = DB_PATH.parent / "dispatcher.heartbeat"
    try:
        ts = _parse_ts(hb.read_text().strip())
        if ts:
            # A naive timestamp is interpreted as UTC so it can be compared
            # with the timezone-aware "now" below.
            if ts.tzinfo is None:
                ts = ts.replace(tzinfo=timezone.utc)
            elapsed = (datetime.now(timezone.utc) - ts).total_seconds()
            # Clamp at zero: a timestamp slightly in the future would
            # otherwise be reported as a negative age.
            age = max(0, int(elapsed))
            return {"alive": age < 30, "age_s": age}
    except Exception:
        # Best-effort: any read or parse failure is reported as "no heartbeat"
        # rather than failing the request that asked for the status.
        pass
    return {"alive": False, "age_s": None}
|
||||||
|
|
||||||
|
|
||||||
async def _worker_stats(worker: dict) -> dict:
|
async def _worker_stats(worker: dict) -> dict:
|
||||||
@@ -221,20 +239,30 @@ async def _worker_stats(worker: dict) -> dict:
|
|||||||
try:
|
try:
|
||||||
proc = await asyncio.create_subprocess_exec(
|
proc = await asyncio.create_subprocess_exec(
|
||||||
"ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", alias,
|
"ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", alias,
|
||||||
"nvidia-smi --query-gpu=memory.used,memory.total,utilization.gpu --format=csv,noheader,nounits && df -h / | tail -1",
|
"nvidia-smi --query-gpu=memory.used,memory.total,utilization.gpu,temperature.gpu,power.draw --format=csv,noheader,nounits && df -h / | tail -1",
|
||||||
stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
|
stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
|
||||||
)
|
)
|
||||||
out, _ = await asyncio.wait_for(proc.communicate(), timeout=4)
|
out, _ = await asyncio.wait_for(proc.communicate(), timeout=5)
|
||||||
text = out.decode().strip().splitlines()
|
text = out.decode().strip().splitlines()
|
||||||
gpu_line = text[0].split(",") if text else ["?", "?", "?"]
|
g = [x.strip() for x in text[0].split(",")] if text else ["?"] * 5
|
||||||
disk = text[1].split() if len(text) > 1 else ["?"] * 6
|
disk = text[1].split() if len(text) > 1 else ["?"] * 6
|
||||||
|
|
||||||
|
def _int(v: str):
|
||||||
|
try:
|
||||||
|
return int(float(v))
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
return {
|
return {
|
||||||
**worker,
|
**worker,
|
||||||
"online": True,
|
"online": True,
|
||||||
"vram_used_mib": int(gpu_line[0].strip()) if gpu_line[0].strip().isdigit() else None,
|
"vram_used_mib": _int(g[0]) if len(g) > 0 else None,
|
||||||
"vram_total_mib": int(gpu_line[1].strip()) if gpu_line[1].strip().isdigit() else None,
|
"vram_total_mib": _int(g[1]) if len(g) > 1 else None,
|
||||||
"gpu_util_pct": int(gpu_line[2].strip()) if gpu_line[2].strip().isdigit() else None,
|
"gpu_util_pct": _int(g[2]) if len(g) > 2 else None,
|
||||||
|
"gpu_temp_c": _int(g[3]) if len(g) > 3 else None,
|
||||||
|
"gpu_power_w": _int(g[4]) if len(g) > 4 else None,
|
||||||
"disk_used_pct": disk[4] if len(disk) > 4 else "?",
|
"disk_used_pct": disk[4] if len(disk) > 4 else "?",
|
||||||
|
"disk_avail": disk[3] if len(disk) > 3 else "?",
|
||||||
}
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {**worker, "online": False, "error": str(e)[:80]}
|
return {**worker, "online": False, "error": str(e)[:80]}
|
||||||
|
|||||||
@@ -90,6 +90,18 @@ button.mini { padding: 0 0.4rem; font-size: 0.75rem; line-height: 1.4; }
|
|||||||
|
|
||||||
@keyframes spin { to { transform: rotate(360deg); } }
|
@keyframes spin { to { transform: rotate(360deg); } }
|
||||||
|
|
||||||
|
.monitor-header { margin-bottom: 0.75rem; }
|
||||||
|
.dispatcher-status { display: inline-flex; align-items: center; gap: 0.4rem;
|
||||||
|
font-size: 0.8rem; color: var(--muted); }
|
||||||
|
.dispatcher-status .dot { width: 8px; height: 8px; border-radius: 50%;
|
||||||
|
background: var(--err); flex-shrink: 0; }
|
||||||
|
.dispatcher-status.alive .dot { background: var(--ok); }
|
||||||
|
.dispatcher-status.alive { color: var(--ok); }
|
||||||
|
.worker-meta { display: flex; flex-wrap: wrap; gap: 0.3rem; margin-top: 0.35rem; }
|
||||||
|
.tag { font-size: 0.75rem; padding: 0.1rem 0.35rem; border-radius: 4px;
|
||||||
|
background: rgba(255,255,255,0.05); color: var(--text); }
|
||||||
|
.tag.warn { color: var(--warn); }
|
||||||
|
|
||||||
button { background: transparent; color: var(--accent); border: 1px solid var(--border);
|
button { background: transparent; color: var(--accent); border: 1px solid var(--border);
|
||||||
padding: 0.2rem 0.6rem; border-radius: 6px; cursor: pointer; font-family: inherit; font-size: 0.75rem; }
|
padding: 0.2rem 0.6rem; border-radius: 6px; cursor: pointer; font-family: inherit; font-size: 0.75rem; }
|
||||||
button:hover { border-color: var(--accent); }
|
button:hover { border-color: var(--accent); }
|
||||||
|
|||||||
@@ -1,3 +1,17 @@
|
|||||||
|
<div class="monitor-header">
|
||||||
|
<div class="dispatcher-status {% if dispatcher.alive %}alive{% else %}dead{% endif %}">
|
||||||
|
<span class="dot"></span>
|
||||||
|
dispatcher
|
||||||
|
{% if dispatcher.alive %}
|
||||||
|
<span class="muted">· il y a {{ dispatcher.age_s }}s</span>
|
||||||
|
{% elif dispatcher.age_s is not none %}
|
||||||
|
<span class="muted">· last seen {{ dispatcher.age_s }}s ago</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="muted">· non démarré</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="worker-grid">
|
<div class="worker-grid">
|
||||||
{% for w in workers %}
|
{% for w in workers %}
|
||||||
<div class="worker {% if not w.online %}offline{% endif %}">
|
<div class="worker {% if not w.online %}offline{% endif %}">
|
||||||
@@ -17,7 +31,12 @@
|
|||||||
<progress value="{{ w.gpu_util_pct or 0 }}" max="100"></progress>
|
<progress value="{{ w.gpu_util_pct or 0 }}" max="100"></progress>
|
||||||
<small>{{ w.gpu_util_pct }}%</small>
|
<small>{{ w.gpu_util_pct }}%</small>
|
||||||
</div>
|
</div>
|
||||||
<div class="bar"><span>Disk /</span><small>{{ w.disk_used_pct }}</small></div>
|
<div class="worker-meta">
|
||||||
|
{% if w.gpu_temp_c is not none %}<span class="tag {% if w.gpu_temp_c > 80 %}warn{% endif %}">{{ w.gpu_temp_c }}°C</span>{% endif %}
|
||||||
|
{% if w.gpu_power_w is not none %}<span class="tag">{{ w.gpu_power_w }}W</span>{% endif %}
|
||||||
|
<span class="tag muted">/ {{ w.disk_avail }} dispo</span>
|
||||||
|
<span class="tag muted">{{ w.disk_used_pct }} utilisé</span>
|
||||||
|
</div>
|
||||||
{% else %}
|
{% else %}
|
||||||
<div class="err">{{ w.error or "unreachable" }}</div>
|
<div class="err">{{ w.error or "unreachable" }}</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|||||||
@@ -329,9 +329,18 @@ def pop_queued_stitch() -> sqlite3.Row | None:
|
|||||||
).fetchone()
|
).fetchone()
|
||||||
|
|
||||||
|
|
||||||
|
def write_heartbeat():
    """Record the current time in the dispatcher heartbeat file.

    Writes the value of ``_now_iso()`` to ``dispatcher.heartbeat`` in the
    directory that holds ``DB_PATH``. The text is first written to a sibling
    temporary file and then renamed over the real file, so a concurrent
    reader never observes a truncated or empty heartbeat.

    Best-effort: any failure is swallowed so that a heartbeat problem never
    interrupts the caller.
    """
    hb = DB_PATH.parent / "dispatcher.heartbeat"
    try:
        tmp = hb.with_name(hb.name + ".tmp")
        tmp.write_text(_now_iso())
        # Rename within the same directory; replaces any existing file.
        tmp.replace(hb)
    except Exception:
        pass
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
print(f"cosma-qc dispatcher · DB={DB_PATH} · workers={[w['host'] for w in WORKERS]}")
|
print(f"cosma-qc dispatcher · DB={DB_PATH} · workers={[w['host'] for w in WORKERS]}")
|
||||||
while True:
|
while True:
|
||||||
|
write_heartbeat()
|
||||||
job = pop_queued()
|
job = pop_queued()
|
||||||
if job:
|
if job:
|
||||||
print(f"→ job #{job['id']} ({job['auv']}/{job['gopro_serial']}/{job['segment_label']})")
|
print(f"→ job #{job['id']} ({job['auv']}/{job['gopro_serial']}/{job['segment_label']})")
|
||||||
|
|||||||
Reference in New Issue
Block a user