from __future__ import annotations import asyncio import json import os import sqlite3 from contextlib import asynccontextmanager, closing from datetime import datetime from pathlib import Path from typing import Any from fastapi import FastAPI, Form, HTTPException, Request from fastapi.responses import HTMLResponse, JSONResponse from fastapi.staticfiles import StaticFiles from fastapi.templating import Jinja2Templates DB_PATH = Path(os.environ.get("COSMA_QC_DB", "/var/lib/cosma-qc/jobs.db")) WORKERS = json.loads(os.environ.get("COSMA_QC_WORKERS", json.dumps([ {"host": "192.168.0.87", "ssh_alias": "gpu", "gpu": "RTX 3060 12GB"}, {"host": "192.168.0.84", "ssh_alias": "cosma-vm","gpu": "RTX 3090 24GB"}, ]))) STATUSES = ("queued", "extracting", "running", "done", "error") def db() -> sqlite3.Connection: DB_PATH.parent.mkdir(parents=True, exist_ok=True) conn = sqlite3.connect(DB_PATH, isolation_level=None) conn.execute("PRAGMA journal_mode=WAL") conn.execute("PRAGMA foreign_keys=ON") conn.row_factory = sqlite3.Row return conn def init_schema() -> None: with closing(db()) as conn: conn.executescript(""" CREATE TABLE IF NOT EXISTS acquisitions ( id INTEGER PRIMARY KEY, name TEXT NOT NULL, source_path TEXT NOT NULL, created_at TEXT NOT NULL DEFAULT (datetime('now')) ); CREATE TABLE IF NOT EXISTS jobs ( id INTEGER PRIMARY KEY, acquisition_id INTEGER NOT NULL REFERENCES acquisitions(id) ON DELETE CASCADE, auv TEXT NOT NULL, gopro_serial TEXT NOT NULL, segment_label TEXT NOT NULL, video_paths TEXT NOT NULL, frame_count INTEGER, frames_dir TEXT, status TEXT NOT NULL DEFAULT 'queued', worker_host TEXT, viser_url TEXT, ply_path TEXT, progress INTEGER NOT NULL DEFAULT 0, log_tail TEXT, error TEXT, started_at TEXT, finished_at TEXT, created_at TEXT NOT NULL DEFAULT (datetime('now')) ); CREATE INDEX IF NOT EXISTS jobs_status_idx ON jobs(status); CREATE INDEX IF NOT EXISTS jobs_acq_idx ON jobs(acquisition_id); """) @asynccontextmanager async def lifespan(_: FastAPI): init_schema() yield app = FastAPI(title="cosma-qc", lifespan=lifespan) templates = Jinja2Templates(directory=Path(__file__).parent / "templates") app.mount("/static", StaticFiles(directory=Path(__file__).parent / "static"), name="static") @app.get("/", response_class=HTMLResponse) async def index(request: Request): with closing(db()) as conn: jobs = conn.execute(""" SELECT j.*, a.name AS acquisition_name FROM jobs j LEFT JOIN acquisitions a ON a.id = j.acquisition_id ORDER BY j.created_at DESC LIMIT 200 """).fetchall() return templates.TemplateResponse("index.html", { "request": request, "jobs": jobs, "workers": WORKERS, }) @app.get("/api/jobs") async def list_jobs(): with closing(db()) as conn: rows = conn.execute("SELECT * FROM jobs ORDER BY created_at DESC LIMIT 500").fetchall() return [dict(r) for r in rows] @app.get("/partials/jobs", response_class=HTMLResponse) async def partial_jobs(request: Request): with closing(db()) as conn: jobs = conn.execute(""" SELECT j.*, a.name AS acquisition_name FROM jobs j LEFT JOIN acquisitions a ON a.id = j.acquisition_id ORDER BY j.created_at DESC LIMIT 200 """).fetchall() return templates.TemplateResponse("_jobs_table.html", {"request": request, "jobs": jobs}) @app.get("/partials/monitor", response_class=HTMLResponse) async def partial_monitor(request: Request): stats = await asyncio.gather(*[_worker_stats(w) for w in WORKERS]) return templates.TemplateResponse("_monitor.html", {"request": request, "workers": stats}) async def _worker_stats(worker: dict) -> dict: alias = worker["ssh_alias"] try: proc = await asyncio.create_subprocess_exec( "ssh", "-o", "ConnectTimeout=3", "-o", "BatchMode=yes", alias, "nvidia-smi --query-gpu=memory.used,memory.total,utilization.gpu --format=csv,noheader,nounits && df -h / | tail -1", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) out, _ = await asyncio.wait_for(proc.communicate(), timeout=4) text = out.decode().strip().splitlines() gpu_line = text[0].split(",") if text else ["?", "?", "?"] disk = text[1].split() if len(text) > 1 else ["?"] * 6 return { **worker, "online": True, "vram_used_mib": int(gpu_line[0].strip()) if gpu_line[0].strip().isdigit() else None, "vram_total_mib": int(gpu_line[1].strip()) if gpu_line[1].strip().isdigit() else None, "gpu_util_pct": int(gpu_line[2].strip()) if gpu_line[2].strip().isdigit() else None, "disk_used_pct": disk[4] if len(disk) > 4 else "?", } except Exception as e: return {**worker, "online": False, "error": str(e)[:80]} @app.post("/jobs/{job_id}/cancel") async def cancel_job(job_id: int): with closing(db()) as conn: conn.execute( "UPDATE jobs SET status='error', error='cancelled by user', finished_at=datetime('now') " "WHERE id=? AND status IN ('queued','extracting','running')", (job_id,), ) return {"ok": True} @app.post("/jobs/{job_id}/retry") async def retry_job(job_id: int): with closing(db()) as conn: conn.execute( "UPDATE jobs SET status='queued', error=NULL, progress=0, started_at=NULL, " "finished_at=NULL, worker_host=NULL WHERE id=? AND status='error'", (job_id,), ) return {"ok": True}