- app/main.py : dashboard /, partials /partials/{jobs,monitor} (htmx polling)
- app/templates/ : index, jobs table, monitor card par worker
- app/static/style.css : thème sombre cohérent
- scripts/ingest.py : scan SSD d'acquisition, EXIF CreateDate → segments
continus par (AUV, GoPro serial) avec seuil configurable
- scripts/dispatcher.py : polling queue, pick worker selon VRAM free,
extraction ffmpeg + lingbot-map windowed --offload_to_cpu, progression DB
- DB : SQLite (acquisitions + jobs), lifecycle queued→extracting→running→done
- Workers par défaut : .87 (3060 12GB) + .84 (3090 24GB)
Contexte : QC terrain le jour-même (avant photogrammétrie à 30 jours),
plusieurs heures × 2 GoPros × 2-3 AUVs d'enregistrement à traiter en parallèle.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
169 lines
6.0 KiB
Python
169 lines
6.0 KiB
Python
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import sqlite3
|
|
from contextlib import asynccontextmanager, closing
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from fastapi import FastAPI, Form, HTTPException, Request
|
|
from fastapi.responses import HTMLResponse, JSONResponse
|
|
from fastapi.staticfiles import StaticFiles
|
|
from fastapi.templating import Jinja2Templates
|
|
|
|
# Location of the job database; override with COSMA_QC_DB for dev/tests.
DB_PATH = Path(os.environ.get("COSMA_QC_DB", "/var/lib/cosma-qc/jobs.db"))

# GPU workers the dispatcher can target. COSMA_QC_WORKERS may carry a JSON
# list of {host, ssh_alias, gpu} dicts; otherwise the on-site defaults apply.
_DEFAULT_WORKERS = [
    {"host": "192.168.0.87", "ssh_alias": "gpu", "gpu": "RTX 3060 12GB"},
    {"host": "192.168.0.84", "ssh_alias": "cosma-vm", "gpu": "RTX 3090 24GB"},
]
_workers_json = os.environ.get("COSMA_QC_WORKERS")
WORKERS = json.loads(_workers_json) if _workers_json is not None else _DEFAULT_WORKERS

# Job lifecycle states, in order of progression ('error' is terminal).
STATUSES = ("queued", "extracting", "running", "done", "error")
def db() -> sqlite3.Connection:
    """Open a fresh connection to the QC database.

    Every call returns its own connection (handlers never share one), in
    autocommit mode with WAL journaling so the dashboard can read while
    the dispatcher writes. Creates the parent directory on first use.
    """
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    # isolation_level=None -> autocommit: each execute() is its own txn.
    connection = sqlite3.connect(DB_PATH, isolation_level=None)
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA foreign_keys=ON"):
        connection.execute(pragma)
    connection.row_factory = sqlite3.Row  # rows behave like dicts
    return connection
def init_schema() -> None:
    """Create the SQLite schema if missing (idempotent; runs at startup).

    Two tables: `acquisitions` (one row per ingest scan) and `jobs` (one
    row per continuous video segment moving through the
    queued -> extracting -> running -> done/error lifecycle).
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS acquisitions (
            id INTEGER PRIMARY KEY,
            name TEXT NOT NULL,
            source_path TEXT NOT NULL,
            created_at TEXT NOT NULL DEFAULT (datetime('now'))
        );

        CREATE TABLE IF NOT EXISTS jobs (
            id INTEGER PRIMARY KEY,
            acquisition_id INTEGER NOT NULL REFERENCES acquisitions(id) ON DELETE CASCADE,
            auv TEXT NOT NULL,
            gopro_serial TEXT NOT NULL,
            segment_label TEXT NOT NULL,
            video_paths TEXT NOT NULL,
            frame_count INTEGER,
            frames_dir TEXT,
            status TEXT NOT NULL DEFAULT 'queued',
            worker_host TEXT,
            viser_url TEXT,
            ply_path TEXT,
            progress INTEGER NOT NULL DEFAULT 0,
            log_tail TEXT,
            error TEXT,
            started_at TEXT,
            finished_at TEXT,
            created_at TEXT NOT NULL DEFAULT (datetime('now'))
        );

        CREATE INDEX IF NOT EXISTS jobs_status_idx ON jobs(status);
        CREATE INDEX IF NOT EXISTS jobs_acq_idx ON jobs(acquisition_id);
    """
    with closing(db()) as conn:
        conn.executescript(ddl)
@asynccontextmanager
async def lifespan(_: FastAPI):
    """App lifespan hook: make sure the schema exists before serving."""
    init_schema()  # startup
    yield
    # nothing to tear down: connections are opened and closed per request
# Application wiring: FastAPI instance, Jinja2 templates and static assets,
# both resolved relative to this file so the app runs from any CWD.
_APP_DIR = Path(__file__).parent

app = FastAPI(title="cosma-qc", lifespan=lifespan)
templates = Jinja2Templates(directory=_APP_DIR / "templates")
app.mount("/static", StaticFiles(directory=_APP_DIR / "static"), name="static")
@app.get("/", response_class=HTMLResponse)
|
|
async def index(request: Request):
|
|
with closing(db()) as conn:
|
|
jobs = conn.execute("""
|
|
SELECT j.*, a.name AS acquisition_name
|
|
FROM jobs j
|
|
LEFT JOIN acquisitions a ON a.id = j.acquisition_id
|
|
ORDER BY j.created_at DESC
|
|
LIMIT 200
|
|
""").fetchall()
|
|
return templates.TemplateResponse("index.html", {
|
|
"request": request,
|
|
"jobs": jobs,
|
|
"workers": WORKERS,
|
|
})
|
|
|
|
|
|
@app.get("/api/jobs")
|
|
async def list_jobs():
|
|
with closing(db()) as conn:
|
|
rows = conn.execute("SELECT * FROM jobs ORDER BY created_at DESC LIMIT 500").fetchall()
|
|
return [dict(r) for r in rows]
|
|
|
|
|
|
@app.get("/partials/jobs", response_class=HTMLResponse)
|
|
async def partial_jobs(request: Request):
|
|
with closing(db()) as conn:
|
|
jobs = conn.execute("""
|
|
SELECT j.*, a.name AS acquisition_name
|
|
FROM jobs j
|
|
LEFT JOIN acquisitions a ON a.id = j.acquisition_id
|
|
ORDER BY j.created_at DESC
|
|
LIMIT 200
|
|
""").fetchall()
|
|
return templates.TemplateResponse("_jobs_table.html", {"request": request, "jobs": jobs})
|
|
|
|
|
|
@app.get("/partials/monitor", response_class=HTMLResponse)
|
|
async def partial_monitor(request: Request):
|
|
stats = await asyncio.gather(*[_worker_stats(w) for w in WORKERS])
|
|
return templates.TemplateResponse("_monitor.html", {"request": request, "workers": stats})
|
|
|
|
|
|
async def _worker_stats(worker: dict) -> dict:
    """Poll one worker over SSH for GPU memory/utilization and root-disk use.

    Returns the worker dict augmented with live stats (``online: True``) or
    an error summary (``online: False``). Never raises.

    Fixes over the previous version:
    - the ssh child is killed when ``wait_for`` times out (it used to be
      left running, leaking one process per failed poll cycle);
    - a non-zero ssh exit status is now reported as offline instead of
      ``online: True`` with "?" stats;
    - output is decoded with ``errors="replace"`` so odd bytes can't raise.
    """
    alias = worker["ssh_alias"]
    proc = None
    try:
        proc = await asyncio.create_subprocess_exec(
            "ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", alias,
            "nvidia-smi --query-gpu=memory.used,memory.total,utilization.gpu "
            "--format=csv,noheader,nounits && df -h / | tail -1",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        out, err = await asyncio.wait_for(proc.communicate(), timeout=4)
        if proc.returncode != 0:
            # ssh reached a shell but the command (or auth) failed.
            detail = err.decode(errors="replace").strip() or f"ssh exit {proc.returncode}"
            return {**worker, "online": False, "error": detail[:80]}

        lines = out.decode(errors="replace").strip().splitlines()
        gpu_line = lines[0].split(",") if lines else ["?", "?", "?"]
        disk = lines[1].split() if len(lines) > 1 else ["?"] * 6

        def _as_int(field: str) -> int | None:
            # nvidia-smi fields are plain integers; anything else -> None.
            field = field.strip()
            return int(field) if field.isdigit() else None

        return {
            **worker,
            "online": True,
            "vram_used_mib": _as_int(gpu_line[0]),
            "vram_total_mib": _as_int(gpu_line[1]),
            "gpu_util_pct": _as_int(gpu_line[2]),
            # `df -h` column 5 is Use% (e.g. "42%").
            "disk_used_pct": disk[4] if len(disk) > 4 else "?",
        }
    except Exception as e:
        # On timeout, wait_for cancels communicate() but does NOT stop the
        # child — kill it so ssh processes don't pile up across htmx polls.
        if proc is not None and proc.returncode is None:
            proc.kill()
        return {**worker, "online": False, "error": str(e)[:80]}
@app.post("/jobs/{job_id}/cancel")
|
|
async def cancel_job(job_id: int):
|
|
with closing(db()) as conn:
|
|
conn.execute(
|
|
"UPDATE jobs SET status='error', error='cancelled by user', finished_at=datetime('now') "
|
|
"WHERE id=? AND status IN ('queued','extracting','running')",
|
|
(job_id,),
|
|
)
|
|
return {"ok": True}
|
|
|
|
|
|
@app.post("/jobs/{job_id}/retry")
|
|
async def retry_job(job_id: int):
|
|
with closing(db()) as conn:
|
|
conn.execute(
|
|
"UPDATE jobs SET status='queued', error=NULL, progress=0, started_at=NULL, "
|
|
"finished_at=NULL, worker_host=NULL WHERE id=? AND status='error'",
|
|
(job_id,),
|
|
)
|
|
return {"ok": True}
|