feat: pipeline monitor + orchestrator stats dashboard
This commit is contained in:
78
app/main.py
78
app/main.py
@@ -52,6 +52,7 @@ from fastapi.staticfiles import StaticFiles
|
|||||||
from fastapi.templating import Jinja2Templates
|
from fastapi.templating import Jinja2Templates
|
||||||
|
|
||||||
DB_PATH = Path(os.environ.get("COSMA_QC_DB", "/var/lib/cosma-qc/jobs.db"))
|
DB_PATH = Path(os.environ.get("COSMA_QC_DB", "/var/lib/cosma-qc/jobs.db"))
|
||||||
|
# Location of the reconstruction pipeline's SQLite state database.
# Overridable through the environment, like DB_PATH, so the dashboard can be
# pointed at another state file without a code change; the default is unchanged.
PIPELINE_DB = Path(os.environ.get("COSMA_QC_PIPELINE_DB", "/cosma-pipeline/state.db"))
|
||||||
WORKERS = json.loads(os.environ.get("COSMA_QC_WORKERS", json.dumps([
|
WORKERS = json.loads(os.environ.get("COSMA_QC_WORKERS", json.dumps([
|
||||||
{"host": "192.168.0.87", "ssh_alias": "gpu", "gpu": "RTX 3060 12GB"},
|
{"host": "192.168.0.87", "ssh_alias": "gpu", "gpu": "RTX 3060 12GB"},
|
||||||
{"host": "192.168.0.84", "ssh_alias": "cosma-vm","gpu": "RTX 3090 24GB"},
|
{"host": "192.168.0.84", "ssh_alias": "cosma-vm","gpu": "RTX 3090 24GB"},
|
||||||
@@ -295,12 +296,58 @@ async def partial_jobs(request: Request):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_pipeline_missions(db_path) -> list:
    """Return the 20 most recently created missions with their jobs.

    The database file is copied into a private temporary directory and the
    copy is queried, so the original file is never opened by SQLite.
    NOTE(review): presumably needed because the pipeline directory is mounted
    read-only in the container -- confirm. If state.db uses WAL journaling the
    ``-wal`` sidecar is not part of the snapshot -- confirm this is acceptable.

    Args:
        db_path: Path of the pipeline state database to snapshot.

    Returns:
        A list of dicts with keys ``id``, ``name``, ``status``, ``jobs``
        (list of row dicts) and ``counts`` (job status -> number of jobs).

    Raises:
        OSError: If the snapshot cannot be created.
        sqlite3.Error: If the snapshot cannot be queried.
    """
    import shutil
    import tempfile
    from contextlib import closing

    # TemporaryDirectory removes the snapshot (and anything SQLite creates
    # next to it) on every exit path, including exceptions.
    with tempfile.TemporaryDirectory() as tmp_dir:
        snapshot = str(Path(tmp_dir) / "state.db")
        shutil.copy2(str(db_path), snapshot)
        # sqlite3's own context manager only commits/rolls back; closing()
        # is what actually releases the connection.
        with closing(sqlite3.connect(snapshot)) as conn:
            conn.row_factory = sqlite3.Row
            missions = conn.execute(
                "SELECT * FROM missions ORDER BY created_at DESC LIMIT 20"
            ).fetchall()
            result = []
            for m in missions:
                jobs = conn.execute(
                    "SELECT * FROM jobs WHERE mission_id=? ORDER BY stage, auv_id",
                    (m["id"],),
                ).fetchall()
                counts = {}
                for j in jobs:
                    counts[j["status"]] = counts.get(j["status"], 0) + 1
                result.append({
                    "id": m["id"],
                    "name": m["name"],
                    "status": m["status"],
                    "jobs": [dict(j) for j in jobs],
                    "counts": counts,
                })
            return result


@app.get("/partials/pipeline", response_class=HTMLResponse)
async def partial_pipeline(request: Request):
    """Render the ``_pipeline.html`` partial from the pipeline state database.

    The template receives ``missions`` (possibly empty) and ``error`` (``None``
    or a message truncated to 200 characters). A missing database file or any
    failure while reading it is reported through ``error`` rather than raised.
    """
    data = {"missions": [], "error": None}
    if not PIPELINE_DB.exists():
        data["error"] = f"{PIPELINE_DB} introuvable"
    else:
        try:
            # File copy + SQLite reads are blocking; run them in the default
            # executor so the event loop keeps serving the other partials.
            loop = asyncio.get_running_loop()
            data["missions"] = await loop.run_in_executor(
                None, _load_pipeline_missions, PIPELINE_DB
            )
        except Exception as e:  # UI boundary: surface the failure in the page
            data["error"] = str(e)[:200]
    return templates.TemplateResponse("_pipeline.html", {"request": request, **data})
|
||||||
|
|
||||||
|
|
||||||
@app.get("/partials/monitor", response_class=HTMLResponse)
async def partial_monitor(request: Request):
    """Render the ``_monitor.html`` partial.

    Worker probes and the orchestrator probe are awaited together; the
    dispatcher status is read once both have completed.
    """
    worker_probes = asyncio.gather(*(_worker_stats(worker) for worker in WORKERS))
    workers, orchestrator = await asyncio.gather(worker_probes, _orchestrator_stats())
    context = {
        "request": request,
        "workers": workers,
        "orchestrator": orchestrator,
        "dispatcher": _dispatcher_status(),
    }
    return templates.TemplateResponse("_monitor.html", context)
|
||||||
|
|
||||||
@@ -353,6 +400,35 @@ async def _worker_stats(worker: dict) -> dict:
|
|||||||
return {**worker, "online": False, "error": str(e)[:80]}
|
return {**worker, "online": False, "error": str(e)[:80]}
|
||||||
|
|
||||||
|
|
||||||
|
async def _orchestrator_stats() -> dict:
|
||||||
|
base = {"host": "192.168.0.83", "role": "orchestrateur (.83)", "cpu": None, "ram_used_pct": None,
|
||||||
|
"ram_total_mib": None, "ssd_used_pct": None, "ssd_avail": None, "online": False}
|
||||||
|
try:
|
||||||
|
cmd = (
|
||||||
|
r"uptime | grep -oP 'load average: \K[\d., ]+' ; "
|
||||||
|
"free -m | awk '/^Mem:/{print $2,$3}' ; "
|
||||||
|
"df -h /mnt/ssd 2>/dev/null | tail -1 || echo '- - - - - -'"
|
||||||
|
)
|
||||||
|
proc = await asyncio.create_subprocess_exec(
|
||||||
|
"ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", "cosma-self", cmd,
|
||||||
|
stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
|
||||||
|
)
|
||||||
|
out, _ = await asyncio.wait_for(proc.communicate(), timeout=5)
|
||||||
|
lines = out.decode().strip().splitlines()
|
||||||
|
load = lines[0].strip() if lines else "?"
|
||||||
|
ram = lines[1].split() if len(lines) > 1 else ["?", "?"]
|
||||||
|
disk = lines[2].split() if len(lines) > 2 else ["?"] * 6
|
||||||
|
total_mib = int(ram[0]) if ram[0].isdigit() else None
|
||||||
|
used_mib = int(ram[1]) if len(ram) > 1 and ram[1].isdigit() else None
|
||||||
|
ram_pct = int(used_mib * 100 / total_mib) if total_mib and used_mib else None
|
||||||
|
return {**base, "online": True, "cpu_load": load,
|
||||||
|
"ram_used_pct": ram_pct, "ram_total_mib": total_mib, "ram_used_mib": used_mib,
|
||||||
|
"ssd_used_pct": disk[4] if len(disk) > 4 else "?",
|
||||||
|
"ssd_avail": disk[3] if len(disk) > 3 else "?"}
|
||||||
|
except Exception as e:
|
||||||
|
return {**base, "error": str(e)[:80]}
|
||||||
|
|
||||||
|
|
||||||
@app.post("/jobs/{job_id}/cancel")
|
@app.post("/jobs/{job_id}/cancel")
|
||||||
async def cancel_job(job_id: int):
|
async def cancel_job(job_id: int):
|
||||||
with closing(db()) as conn:
|
with closing(db()) as conn:
|
||||||
|
|||||||
@@ -198,3 +198,25 @@ code { background: rgba(255,255,255,0.05); padding: 0 0.25rem; border-radius: 3p
|
|||||||
.viewer-btn { background: #1a3a2a; color: #4ade80; border: 1px solid #4ade80; border-radius: 3px; padding: 2px 8px; cursor: pointer; font-size: 0.8rem; }
|
.viewer-btn { background: #1a3a2a; color: #4ade80; border: 1px solid #4ade80; border-radius: 3px; padding: 2px 8px; cursor: pointer; font-size: 0.8rem; }
|
||||||
.viewer-btn:hover { background: #4ade80; color: #0a1a10; }
|
.viewer-btn:hover { background: #4ade80; color: #0a1a10; }
|
||||||
.viewer-btn:disabled { opacity: 0.5; cursor: wait; }
|
.viewer-btn:disabled { opacity: 0.5; cursor: wait; }
|
||||||
|
|
||||||
|
/* ==== Pipeline section ==== */
|
||||||
|
.pipeline-mission { margin-bottom: 1rem; }
|
||||||
|
.pm-header { display: flex; align-items: center; gap: 0.75rem; margin-bottom: 0.4rem; flex-wrap: wrap; }
|
||||||
|
.pm-name { font-weight: 600; color: var(--accent); }
|
||||||
|
.pm-status { font-size: 0.75rem; padding: 0.1rem 0.4rem; border-radius: 4px; text-transform: uppercase; font-weight: 600; }
|
||||||
|
.pm-counts { display: flex; gap: 0.4rem; flex-wrap: wrap; }
|
||||||
|
.cnt { font-size: 0.72rem; padding: 0.1rem 0.35rem; border-radius: 3px; background: rgba(255,255,255,0.05); }
|
||||||
|
.cnt.ok { color: var(--ok); } .cnt.busy { color: var(--accent); } .cnt.warn { color: var(--warn); } .cnt.err { color: var(--err); }
|
||||||
|
|
||||||
|
.pipeline-jobs-table { width: 100%; border-collapse: collapse; font-size: 0.82rem; }
|
||||||
|
.pipeline-jobs-table th { text-align: left; padding: 3px 8px; color: var(--muted); font-size: 0.70rem; text-transform: uppercase; border-bottom: 1px solid var(--border); }
|
||||||
|
.pipeline-jobs-table td { padding: 4px 8px; border-bottom: 1px solid rgba(255,255,255,0.03); }
|
||||||
|
.pipeline-jobs-table tr.pj-err-row td { padding: 0 8px 4px; }
|
||||||
|
|
||||||
|
.pj-badge { font-size: 0.70rem; padding: 1px 5px; border-radius: 3px; text-transform: uppercase; font-weight: 600; }
|
||||||
|
.status-done, .pj-badge.status-done { color: var(--ok); background: rgba(61,220,132,0.1); }
|
||||||
|
.status-running, .pj-badge.status-running { color: var(--accent); background: rgba(95,208,255,0.1); }
|
||||||
|
.status-queued, .pj-badge.status-queued { color: var(--muted); }
|
||||||
|
.status-degraded, .pj-badge.status-degraded { color: var(--warn); background: rgba(245,197,24,0.1); }
|
||||||
|
.status-error, .pj-badge.status-error { color: var(--err); background: rgba(255,92,122,0.1); }
|
||||||
|
.status-ingested, .pm-status.status-ingested { color: var(--accent); background: rgba(95,208,255,0.12); }
|
||||||
|
|||||||
@@ -12,6 +12,33 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{% if orchestrator %}
|
||||||
|
<div class="worker {% if not orchestrator.online %}offline{% endif %}" style="margin-bottom:0.75rem">
|
||||||
|
<div class="hdr">
|
||||||
|
<b>{{ orchestrator.role }}</b>
|
||||||
|
<span class="gpu">orchestrateur</span>
|
||||||
|
<span class="state">{% if orchestrator.online %}online{% else %}offline{% endif %}</span>
|
||||||
|
</div>
|
||||||
|
{% if orchestrator.online %}
|
||||||
|
<div class="bar">
|
||||||
|
<span>CPU</span>
|
||||||
|
<span style="font-size:0.8rem;color:var(--accent)">{{ orchestrator.cpu_load or '?' }}</span>
|
||||||
|
</div>
|
||||||
|
<div class="bar">
|
||||||
|
<span>RAM</span>
|
||||||
|
<progress value="{{ orchestrator.ram_used_mib or 0 }}" max="{{ orchestrator.ram_total_mib or 1 }}"></progress>
|
||||||
|
<small>{{ orchestrator.ram_used_mib or '?' }} / {{ orchestrator.ram_total_mib or '?' }} MiB</small>
|
||||||
|
</div>
|
||||||
|
<div class="worker-meta">
|
||||||
|
<span class="tag muted">SSD {{ orchestrator.ssd_avail }} dispo</span>
|
||||||
|
<span class="tag muted">{{ orchestrator.ssd_used_pct }} utilise</span>
|
||||||
|
</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="err">{{ orchestrator.error or "unreachable" }}</div>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
<div class="worker-grid">
|
<div class="worker-grid">
|
||||||
{% for w in workers %}
|
{% for w in workers %}
|
||||||
<div class="worker {% if not w.online %}offline{% endif %}">
|
<div class="worker {% if not w.online %}offline{% endif %}">
|
||||||
|
|||||||
47
app/templates/_pipeline.html
Normal file
47
app/templates/_pipeline.html
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
{# Pipeline partial: one block per mission with per-status job counts and a job table. #}
{% if error %}
<p class="err">{{ error }}</p>
{% elif not missions %}
<p class="muted">Aucune mission dans state.db.</p>
{% else %}
{% for m in missions %}
<div class="pipeline-mission">
  <div class="pm-header">
    <span class="pm-name">{{ m.name }}</span>
    <span class="pm-status status-{{ m.status }}">{{ m.status }}</span>
    <span class="pm-counts">
      {# (job status, badge CSS class) pairs; a badge is shown only for non-zero counts. #}
      {% for status, css in [('done', 'ok'), ('running', 'busy'), ('queued', 'muted'), ('degraded', 'warn'), ('error', 'err')] %}
      {% if m.counts.get(status) %}<span class="cnt {{ css }}">{{ m.counts[status] }} {{ status }}</span>{% endif %}
      {% endfor %}
    </span>
  </div>
  <table class="pipeline-jobs-table">
    <thead>
      <tr><th>AUV</th><th>Segment</th><th>Stage</th><th>Status</th><th>Worker</th><th>Duree</th></tr>
    </thead>
    <tbody>
      {% for j in m.jobs %}
      <tr class="pj-row status-{{ j.status }}">
        <td>{{ j.auv_id }}</td>
        <td class="muted">{{ j.segment_label or '-' }}</td>
        <td><code>{{ j.stage }}</code></td>
        <td><span class="pj-badge status-{{ j.status }}">{{ j.status }}</span></td>
        <td class="muted">{{ j.worker_host or '-' }}</td>
        <td class="muted">
          {# NOTE(review): the column is labelled "Duree" but shows characters 11-16 of a timestamp (presumably HH:MM) -- confirm intent. #}
          {% if j.started_at and j.finished_at %}
          {{ j.finished_at[11:16] }}
          {% elif j.started_at %}
          {{ j.started_at[11:16] }} →
          {% else %}-{% endif %}
        </td>
      </tr>
      {% if j.error_msg %}
      <tr class="pj-err-row"><td colspan="6" class="err" style="font-size:0.72rem;padding:2px 8px">{{ j.error_msg[:120] }}</td></tr>
      {% endif %}
      {% endfor %}
    </tbody>
  </table>
</div>
{% endfor %}
{% endif %}
|
||||||
@@ -18,6 +18,13 @@
|
|||||||
<p class="muted">Chargement des workers…</p>
|
<p class="muted">Chargement des workers…</p>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
<section id="pipeline">
|
||||||
|
<h2>Pipeline reconstruction</h2>
|
||||||
|
<div hx-get="/partials/pipeline" hx-trigger="load, every 5s" hx-swap="innerHTML">
|
||||||
|
<p class="muted">Chargement pipeline...</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
<section id="jobs">
|
<section id="jobs">
|
||||||
<h2>Jobs</h2>
|
<h2>Jobs</h2>
|
||||||
<div id="jobs-table" hx-get="/partials/jobs" hx-trigger="load, every 3s" hx-swap="innerHTML">
|
<div id="jobs-table" hx-get="/partials/jobs" hx-trigger="load, every 3s" hx-swap="innerHTML">
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
- /home/cosma/cosma-qc-data:/var/lib/cosma-qc
|
- /home/cosma/cosma-qc-data:/var/lib/cosma-qc
|
||||||
- /home/cosma/.ssh:/ssh-in:ro
|
- /home/cosma/.ssh:/ssh-in:ro
|
||||||
|
- /home/cosma/cosma-pipeline:/cosma-pipeline:ro
|
||||||
|
- /mnt/ssd:/mnt/ssd:ro
|
||||||
environment:
|
environment:
|
||||||
COSMA_QC_WORKERS: |
|
COSMA_QC_WORKERS: |
|
||||||
[
|
[
|
||||||
|
|||||||
Reference in New Issue
Block a user