stitch pipeline câblé : DB + dispatcher + UI + fix subpath Caddy
- Table stitches (per_auv + cross_auv) avec cancel/retry API - Dispatcher : PLY export auto (--save_ply), trigger stitch en cascade quand tous les jobs d'un AUV sont done - UI : section stitch live depuis DB avec statuts/durées/boutons - Fix : <base href="/cosma-qc/"> + chemins relatifs pour Caddy subpath - open3d 0.19.0 installé sur gpu (.87) - SSH key .82→.87 configurée, alias gpu ajouté sur .82 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
62
app/main.py
62
app/main.py
@@ -102,6 +102,24 @@ def init_schema() -> None:
|
||||
|
||||
CREATE INDEX IF NOT EXISTS jobs_status_idx ON jobs(status);
|
||||
CREATE INDEX IF NOT EXISTS jobs_acq_idx ON jobs(acquisition_id);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS stitches (
|
||||
id INTEGER PRIMARY KEY,
|
||||
acquisition_id INTEGER NOT NULL REFERENCES acquisitions(id) ON DELETE CASCADE,
|
||||
level TEXT NOT NULL DEFAULT 'per_auv',
|
||||
auv TEXT,
|
||||
input_job_ids TEXT NOT NULL DEFAULT '[]',
|
||||
input_stitch_ids TEXT NOT NULL DEFAULT '[]',
|
||||
output_ply TEXT,
|
||||
status TEXT NOT NULL DEFAULT 'queued',
|
||||
worker_host TEXT,
|
||||
started_at TEXT,
|
||||
finished_at TEXT,
|
||||
error TEXT,
|
||||
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS stitches_acq_idx ON stitches(acquisition_id);
|
||||
""")
|
||||
|
||||
|
||||
@@ -124,6 +142,10 @@ def _build_acquisitions():
|
||||
jobs = conn.execute(
|
||||
"SELECT * FROM jobs ORDER BY auv, gopro_serial, segment_label"
|
||||
).fetchall()
|
||||
stitches = conn.execute(
|
||||
"SELECT * FROM stitches ORDER BY level DESC, auv"
|
||||
).fetchall()
|
||||
|
||||
by_acq: dict[int, list[dict]] = {}
|
||||
by_acq_total: dict[int, int] = {}
|
||||
for j in jobs:
|
||||
@@ -133,12 +155,30 @@ def _build_acquisitions():
|
||||
by_acq.setdefault(j["acquisition_id"], []).append(d)
|
||||
by_acq_total[j["acquisition_id"]] = by_acq_total.get(j["acquisition_id"], 0) + dur_s
|
||||
|
||||
stitches_by_acq: dict[int, list[dict]] = {}
|
||||
for s in stitches:
|
||||
d = dict(s)
|
||||
start = _parse_ts(s["started_at"])
|
||||
end = _parse_ts(s["finished_at"]) or (
|
||||
datetime.now(timezone.utc) if s["status"] == "running" else None
|
||||
)
|
||||
if start and end:
|
||||
if start.tzinfo is None:
|
||||
start = start.replace(tzinfo=timezone.utc)
|
||||
if end.tzinfo is None:
|
||||
end = end.replace(tzinfo=timezone.utc)
|
||||
d["_duration"] = _fmt_dur(int((end - start).total_seconds()))
|
||||
else:
|
||||
d["_duration"] = ""
|
||||
stitches_by_acq.setdefault(s["acquisition_id"], []).append(d)
|
||||
|
||||
return [
|
||||
{
|
||||
"id": acq["id"],
|
||||
"name": acq["name"],
|
||||
"source_path": acq["source_path"],
|
||||
"jobs": by_acq.get(acq["id"], []),
|
||||
"stitches": stitches_by_acq.get(acq["id"], []),
|
||||
"total_duration": _fmt_dur(by_acq_total.get(acq["id"], 0)),
|
||||
}
|
||||
for acq in acqs
|
||||
@@ -220,3 +260,25 @@ async def retry_job(job_id: int):
|
||||
(job_id,),
|
||||
)
|
||||
return {"ok": True}
|
||||
|
||||
|
||||
@app.post("/stitches/{stitch_id}/cancel")
|
||||
async def cancel_stitch(stitch_id: int):
|
||||
with closing(db()) as conn:
|
||||
conn.execute(
|
||||
"UPDATE stitches SET status='error', error='cancelled by user', finished_at=datetime('now') "
|
||||
"WHERE id=? AND status IN ('queued','running')",
|
||||
(stitch_id,),
|
||||
)
|
||||
return {"ok": True}
|
||||
|
||||
|
||||
@app.post("/stitches/{stitch_id}/retry")
|
||||
async def retry_stitch(stitch_id: int):
|
||||
with closing(db()) as conn:
|
||||
conn.execute(
|
||||
"UPDATE stitches SET status='queued', error=NULL, output_ply=NULL, "
|
||||
"started_at=NULL, finished_at=NULL, worker_host=NULL WHERE id=? AND status='error'",
|
||||
(stitch_id,),
|
||||
)
|
||||
return {"ok": True}
|
||||
|
||||
1
app/static/htmx.min.js
vendored
Normal file
1
app/static/htmx.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
@@ -1,12 +1,3 @@
|
||||
{% macro duration(job) -%}
|
||||
{%- if job.started_at and job.finished_at -%}
|
||||
{{ job._duration }}
|
||||
{%- elif job.started_at and not job.finished_at -%}
|
||||
{{ job._duration }}
|
||||
{%- else -%}
|
||||
{%- endif -%}
|
||||
{%- endmacro %}
|
||||
|
||||
{% if not acquisitions %}
|
||||
<p class="muted">Aucune acquisition. Ingeste un dossier via <code>scripts/ingest.py</code>.</p>
|
||||
{% else %}
|
||||
@@ -34,26 +25,56 @@
|
||||
</span>
|
||||
<span class="dur">{{ j._duration }}</span>
|
||||
{% if j.status in ('queued','extracting','running') %}
|
||||
<button class="mini" hx-post="/jobs/{{ j.id }}/cancel" hx-target="#jobs-table">×</button>
|
||||
<button class="mini" hx-post="jobs/{{ j.id }}/cancel" hx-target="#jobs-table">×</button>
|
||||
{% elif j.status == 'error' %}
|
||||
<button class="mini" hx-post="/jobs/{{ j.id }}/retry" hx-target="#jobs-table">↻</button>
|
||||
<button class="mini" hx-post="jobs/{{ j.id }}/retry" hx-target="#jobs-table">↻</button>
|
||||
{% else %}
|
||||
<span></span>
|
||||
{% endif %}
|
||||
</li>
|
||||
{% if j.error %}<li class="err-line">{{ j.error }}</li>{% endif %}
|
||||
{% endfor %}
|
||||
</ul>
|
||||
|
||||
{# Stitch section (placeholder — wired up once multi-job stitching lands) #}
|
||||
<div class="stitch-section">
|
||||
<div class="stitch-title">
|
||||
<span class="icon"><span class="sq">■</span></span>
|
||||
<span>stitch</span>
|
||||
</div>
|
||||
<ul class="stitch-children">
|
||||
<li class="sub pending"><span class="sq">■</span> pair GP1↔GP2 per AUV</li>
|
||||
<li class="sub pending"><span class="sq">■</span> cross-AUV merge</li>
|
||||
<li class="sub pending"><span class="sq">■</span> final PLY</li>
|
||||
</ul>
|
||||
{% if acq.stitches %}
|
||||
<ul class="stitch-children">
|
||||
{% for s in acq.stitches %}
|
||||
<li class="sub {{ s.status }}">
|
||||
<span class="icon stitch-icon">
|
||||
{% if s.status == 'done' %}<span class="check ok">✓</span>
|
||||
{% elif s.status == 'running' %}<span class="spin">↻</span>
|
||||
{% elif s.status == 'error' %}<span class="err">✕</span>
|
||||
{% else %}<span class="sq">■</span>{% endif %}
|
||||
</span>
|
||||
<span>
|
||||
{% if s.level == 'per_auv' %}pair GP1↔GP2 {{ s.auv }}
|
||||
{% else %}merge final{% endif %}
|
||||
{% if s._duration %}<span class="dur muted"> — {{ s._duration }}</span>{% endif %}
|
||||
{% if s.status == 'done' and s.output_ply %}
|
||||
<span class="ext" title="{{ s.output_ply }}">PLY</span>
|
||||
{% endif %}
|
||||
</span>
|
||||
{% if s.status in ('queued','running') %}
|
||||
<button class="mini" hx-post="stitches/{{ s.id }}/cancel" hx-target="#jobs-table">×</button>
|
||||
{% elif s.status == 'error' %}
|
||||
<button class="mini" hx-post="stitches/{{ s.id }}/retry" hx-target="#jobs-table">↻</button>
|
||||
{% endif %}
|
||||
</li>
|
||||
{% if s.error %}<li class="err-line" style="padding-left:42px">{{ s.error[:120] }}</li>{% endif %}
|
||||
{% endfor %}
|
||||
</ul>
|
||||
{% else %}
|
||||
<ul class="stitch-children">
|
||||
<li class="sub pending"><span class="sq">■</span> pair GP1↔GP2 per AUV</li>
|
||||
<li class="sub pending"><span class="sq">■</span> cross-AUV merge</li>
|
||||
<li class="sub pending"><span class="sq">■</span> final PLY</li>
|
||||
</ul>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
|
||||
@@ -4,8 +4,9 @@
|
||||
<meta charset="utf-8">
|
||||
<title>cosma-qc — dashboard</title>
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<script src="https://unpkg.com/htmx.org@2.0.4"></script>
|
||||
<link rel="stylesheet" href="/static/style.css">
|
||||
<base href="/cosma-qc/">
|
||||
<script src="static/htmx.min.js"></script>
|
||||
<link rel="stylesheet" href="static/style.css">
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
@@ -13,13 +14,13 @@
|
||||
<span class="sub">post-acquisition QC · lingbot-map pipeline</span>
|
||||
</header>
|
||||
|
||||
<section id="monitor" hx-get="/partials/monitor" hx-trigger="load, every 5s" hx-swap="innerHTML">
|
||||
<section id="monitor" hx-get="partials/monitor" hx-trigger="load, every 5s" hx-swap="innerHTML">
|
||||
<p class="muted">Chargement des workers…</p>
|
||||
</section>
|
||||
|
||||
<section id="jobs">
|
||||
<h2>Jobs</h2>
|
||||
<div id="jobs-table" hx-get="/partials/jobs" hx-trigger="load, every 3s" hx-swap="innerHTML">
|
||||
<div id="jobs-table" hx-get="partials/jobs" hx-trigger="load, every 3s" hx-swap="innerHTML">
|
||||
<p class="muted">Chargement…</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
@@ -1,9 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Dispatcher daemon: picks queued jobs and runs them on available workers.
|
||||
|
||||
One-shot worker loop. Run as a systemd service (or manually). Handles both
|
||||
extraction (ffmpeg on the worker) and reconstruction (lingbot-map on the
|
||||
worker). Progress is written back to the DB.
|
||||
"""Dispatcher daemon: picks queued jobs/stitches and runs them on available workers.
|
||||
|
||||
Env:
|
||||
COSMA_QC_DB : SQLite path (default /var/lib/cosma-qc/jobs.db)
|
||||
@@ -14,8 +10,11 @@ Env:
|
||||
COSMA_QC_IMG_W : image width (default 518)
|
||||
|
||||
Jobs lifecycle:
|
||||
queued → extracting → running → done
|
||||
queued → extracting → running → done → [triggers per_auv stitch]
|
||||
↘ error
|
||||
Stitch lifecycle:
|
||||
queued → running → done → [triggers cross_auv stitch if all per_auv done]
|
||||
↘ error
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -40,6 +39,7 @@ FPS = int(os.environ.get("COSMA_QC_FPS", "3"))
|
||||
IMG_H = int(os.environ.get("COSMA_QC_IMG_H", "294"))
|
||||
IMG_W = int(os.environ.get("COSMA_QC_IMG_W", "518"))
|
||||
POLL_S = int(os.environ.get("COSMA_QC_POLL_S", "4"))
|
||||
STITCH_SCRIPT = Path(__file__).parent / "stitch.py"
|
||||
|
||||
DEFAULT_WORKERS = [
|
||||
{
|
||||
@@ -93,9 +93,7 @@ def pick_worker(estimated_vram_mib: int) -> dict | None:
|
||||
|
||||
|
||||
def estimate_vram_mib(frame_count: int) -> int:
    """Estimate peak GPU memory (MiB) needed to reconstruct *frame_count* frames.

    Linear model with headroom, calibrated empirically:
    ~300 frames peaked around 9.4 GiB; ~600 frames hit OOM near 11 GiB.
    """
    return int(3500 + 13 * frame_count)  # MiB
|
||||
|
||||
|
||||
def set_status(job_id: int, **fields):
|
||||
@@ -106,6 +104,14 @@ def set_status(job_id: int, **fields):
|
||||
conn.execute(q, (*vals, job_id))
|
||||
|
||||
|
||||
def set_stitch_status(stitch_id: int, **fields):
    """Update arbitrary columns of one `stitches` row.

    Column names come from trusted internal callers only — they are
    interpolated into the SQL; values go through placeholders.
    """
    assignments = ", ".join(f"{col}=?" for col in fields)
    params = (*fields.values(), stitch_id)
    with closing(db()) as conn:
        conn.execute("UPDATE stitches SET " + assignments + " WHERE id=?", params)
|
||||
|
||||
|
||||
def count_frames(worker: dict, frames_dir: str) -> int:
|
||||
rc, out, _ = ssh(worker["ssh_alias"], f"ls {shlex.quote(frames_dir)} 2>/dev/null | wc -l")
|
||||
try:
|
||||
@@ -115,7 +121,6 @@ def count_frames(worker: dict, frames_dir: str) -> int:
|
||||
|
||||
|
||||
def do_extract(job: sqlite3.Row, worker: dict) -> str:
|
||||
"""Run ffmpeg on the worker for each video in job.video_paths."""
|
||||
videos = json.loads(job["video_paths"])
|
||||
frames_dir = f"{worker['frames_dir']}/job_{job['id']}"
|
||||
ssh(worker["ssh_alias"], f"mkdir -p {shlex.quote(frames_dir)}")
|
||||
@@ -123,7 +128,6 @@ def do_extract(job: sqlite3.Row, worker: dict) -> str:
|
||||
for v in videos:
|
||||
vf = f"fps={FPS},scale={IMG_W}:{IMG_H}"
|
||||
pattern = f"{frames_dir}/frame_%06d.jpg"
|
||||
# Prepend to idx to keep frame ordering across videos.
|
||||
cmd = (
|
||||
f"ffmpeg -hide_banner -loglevel error -i {shlex.quote(v)} "
|
||||
f"-vf {shlex.quote(vf)} -start_number {idx} -q:v 4 "
|
||||
@@ -132,21 +136,22 @@ def do_extract(job: sqlite3.Row, worker: dict) -> str:
|
||||
rc, _, err = ssh(worker["ssh_alias"], cmd, timeout=3600)
|
||||
if rc != 0:
|
||||
raise RuntimeError(f"ffmpeg failed on {v}: {err[:200]}")
|
||||
# Count frames now present to bump idx
|
||||
idx = count_frames(worker, frames_dir)
|
||||
set_status(job["id"], frame_count=idx)
|
||||
return frames_dir
|
||||
|
||||
|
||||
def do_reconstruct(job: sqlite3.Row, worker: dict, frames_dir: str) -> tuple[str, str]:
|
||||
def do_reconstruct(job: sqlite3.Row, worker: dict, frames_dir: str) -> tuple[str, str, str]:
|
||||
port = worker["viser_port_base"] + job["id"]
|
||||
log = f"/tmp/cosma-qc-job-{job['id']}.log"
|
||||
ckpt = f"{worker['lingbot_path']}/checkpoints/lingbot-map/lingbot-map-long.pt"
|
||||
ply_path = f"{frames_dir}/reconstruction.ply"
|
||||
cmd = (
|
||||
f"cd {shlex.quote(worker['lingbot_path'])} && source .venv/bin/activate && "
|
||||
f"python3 demo.py --model_path {shlex.quote(ckpt)} "
|
||||
f"--image_folder {shlex.quote(frames_dir)} --port {port} "
|
||||
f"--use_sdpa --mode windowed --window_size 16 --overlap_size 2 --offload_to_cpu "
|
||||
f"--save_ply {shlex.quote(ply_path)} "
|
||||
f"> {log} 2>&1"
|
||||
)
|
||||
rc, _, err = ssh(worker["ssh_alias"], cmd, timeout=3 * 3600)
|
||||
@@ -154,7 +159,138 @@ def do_reconstruct(job: sqlite3.Row, worker: dict, frames_dir: str) -> tuple[str
|
||||
tail = ssh(worker["ssh_alias"], f"tail -30 {log}")[1]
|
||||
raise RuntimeError(f"demo.py failed: {err[:200]}\n---\n{tail[:800]}")
|
||||
viser_url = f"http://{worker['host']}:{port}"
|
||||
return viser_url, log
|
||||
return viser_url, log, ply_path
|
||||
|
||||
|
||||
def _maybe_create_per_auv_stitch(job_id: int):
    """Queue a per-AUV stitch once every job of that (acquisition, AUV) is done.

    Idempotent: bails out while jobs are still pending, and again if the
    per_auv stitch row already exists for this pair.
    NOTE(review): no explicit commit here — presumably db() opens the
    connection in autocommit mode; confirm.
    """
    with closing(db()) as conn:
        job = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()
        if job is None:
            return
        acq_id = job["acquisition_id"]
        auv = job["auv"]

        # Every job of this AUV must be finished before stitching makes sense.
        total = conn.execute(
            "SELECT COUNT(*) FROM jobs WHERE acquisition_id=? AND auv=?", (acq_id, auv)
        ).fetchone()[0]
        done = conn.execute(
            "SELECT COUNT(*) FROM jobs WHERE acquisition_id=? AND auv=? AND status='done'", (acq_id, auv)
        ).fetchone()[0]
        if not total or done != total:
            return

        # Already created once — nothing to do.
        already = conn.execute(
            "SELECT id FROM stitches WHERE acquisition_id=? AND level='per_auv' AND auv=?", (acq_id, auv)
        ).fetchone()
        if already is not None:
            return

        rows = conn.execute(
            "SELECT id FROM jobs WHERE acquisition_id=? AND auv=?", (acq_id, auv)
        ).fetchall()
        job_ids = [row["id"] for row in rows]
        conn.execute(
            "INSERT INTO stitches (acquisition_id, level, auv, input_job_ids) VALUES (?,?,?,?)",
            (acq_id, "per_auv", auv, json.dumps(job_ids))
        )
        print(f" → Stitch per_auv créé pour {auv} acq#{acq_id}")
|
||||
|
||||
|
||||
def _maybe_create_cross_auv_stitch(stitch_id: int):
    """Queue the cross-AUV merge once every per_auv stitch of the acquisition is done.

    Requires at least two distinct AUVs (otherwise there is nothing to merge).
    Idempotent: bails out while per_auv stitches are pending, and again if the
    cross_auv row already exists.
    """
    with closing(db()) as conn:
        st = conn.execute("SELECT * FROM stitches WHERE id=?", (stitch_id,)).fetchone()
        if st is None:
            return
        acq_id = st["acquisition_id"]

        # A single-AUV acquisition has nothing to cross-merge.
        n_auvs = conn.execute(
            "SELECT COUNT(DISTINCT auv) FROM jobs WHERE acquisition_id=?", (acq_id,)
        ).fetchone()[0]
        if n_auvs < 2:
            return

        total_per_auv = conn.execute(
            "SELECT COUNT(*) FROM stitches WHERE acquisition_id=? AND level='per_auv'", (acq_id,)
        ).fetchone()[0]
        done_per_auv = conn.execute(
            "SELECT COUNT(*) FROM stitches WHERE acquisition_id=? AND level='per_auv' AND status='done'", (acq_id,)
        ).fetchone()[0]
        # Wait until one per_auv stitch per AUV has completed.
        if not total_per_auv or done_per_auv < n_auvs:
            return

        already = conn.execute(
            "SELECT id FROM stitches WHERE acquisition_id=? AND level='cross_auv'", (acq_id,)
        ).fetchone()
        if already is not None:
            return

        rows = conn.execute(
            "SELECT id FROM stitches WHERE acquisition_id=? AND level='per_auv'", (acq_id,)
        ).fetchall()
        stitch_ids = [row["id"] for row in rows]
        conn.execute(
            "INSERT INTO stitches (acquisition_id, level, input_stitch_ids, input_job_ids) VALUES (?,?,?,?)",
            (acq_id, "cross_auv", json.dumps(stitch_ids), "[]")
        )
        print(f" → Stitch cross_auv créé pour acq#{acq_id}")
|
||||
|
||||
|
||||
def deploy_stitch_script(worker: dict):
    """Copy stitch.py to /tmp/cosma-stitch.py on the worker via scp.

    Best-effort: a failed copy is logged but deliberately not raised, so the
    dispatcher loop keeps running — the subsequent remote run will fail and
    mark the stitch as 'error' with its own message. Previously the return
    code was silently discarded, which made scp failures invisible.
    """
    result = subprocess.run(
        ["scp", str(STITCH_SCRIPT), f"{worker['ssh_alias']}:/tmp/cosma-stitch.py"],
        capture_output=True, timeout=30
    )
    if result.returncode != 0:
        # Surface the failure instead of silently continuing with a stale script.
        print(f" ! scp stitch.py → {worker['ssh_alias']} failed (rc={result.returncode})")
|
||||
|
||||
|
||||
def _gather_input_plys(stitch: sqlite3.Row) -> list[str]:
    """Resolve the input PLY paths of a stitch.

    per_auv stitches read job outputs (jobs.ply_path); cross_auv stitches
    read the per_auv stitch outputs (stitches.output_ply). Rows with a NULL
    path are dropped.
    """
    with closing(db()) as conn:
        if stitch["level"] == "per_auv":
            ids = json.loads(stitch["input_job_ids"] or "[]")
            table, col = "jobs", "ply_path"
        else:
            ids = json.loads(stitch["input_stitch_ids"] or "[]")
            table, col = "stitches", "output_ply"
        if not ids:
            return []
        placeholders = ",".join("?" * len(ids))
        rows = conn.execute(
            f"SELECT {col} FROM {table} WHERE id IN ({placeholders})", ids
        ).fetchall()
    return [row[col] for row in rows if row[col]]


def run_one_stitch(stitch: sqlite3.Row):
    """Run one queued stitch on a worker over SSH and record the outcome.

    Gathers the input PLYs, ships the stitch script to the worker, runs it
    inside the lingbot venv, then marks the row done/error. A successful
    per_auv stitch may trigger creation of the cross_auv merge.
    """
    stitch_id = stitch["id"]
    # Stitching is cheap on VRAM; fall back to the first worker if none is free.
    worker = pick_worker(2000) or WORKERS[0]

    ply_paths = _gather_input_plys(stitch)
    if len(ply_paths) < 2:
        set_stitch_status(stitch_id, status="error",
                          error=f"Pas assez de PLY disponibles ({len(ply_paths)})",
                          finished_at=_now_iso())
        return

    out_ply = f"{worker['frames_dir']}/stitch_{stitch_id}.ply"
    deploy_stitch_script(worker)

    remote_cmd = (
        f"source {shlex.quote(worker['lingbot_path'])}/.venv/bin/activate && "
        f"python3 /tmp/cosma-stitch.py {shlex.quote(out_ply)} "
        + " ".join(shlex.quote(p) for p in ply_paths)
        + f" > /tmp/cosma-stitch-{stitch_id}.log 2>&1"
    )

    set_stitch_status(stitch_id, status="running", worker_host=worker["host"], started_at=_now_iso())
    try:
        rc, _, err = ssh(worker["ssh_alias"], remote_cmd, timeout=4 * 3600)
    except Exception as e:
        set_stitch_status(stitch_id, status="error", error=str(e)[:500], finished_at=_now_iso())
        return

    if rc != 0:
        # Pull the log tail so the UI shows something actionable.
        tail = ssh(worker["ssh_alias"], f"tail -20 /tmp/cosma-stitch-{stitch_id}.log")[1]
        set_stitch_status(stitch_id, status="error",
                          error=f"{err[:200]}\n{tail[:600]}",
                          finished_at=_now_iso())
        return

    set_stitch_status(stitch_id, status="done", output_ply=out_ply, finished_at=_now_iso())
    _maybe_create_cross_auv_stitch(stitch_id)
|
||||
|
||||
|
||||
def run_one(job: sqlite3.Row):
|
||||
@@ -162,21 +298,19 @@ def run_one(job: sqlite3.Row):
|
||||
estimated = estimate_vram_mib(job["frame_count"] or 400)
|
||||
worker = pick_worker(estimated)
|
||||
if not worker:
|
||||
return # retry later
|
||||
set_status(job_id, status="extracting", worker_host=worker["host"],
|
||||
started_at=_now_iso())
|
||||
return
|
||||
set_status(job_id, status="extracting", worker_host=worker["host"], started_at=_now_iso())
|
||||
try:
|
||||
frames_dir = do_extract(job, worker)
|
||||
frame_count = count_frames(worker, frames_dir)
|
||||
set_status(job_id, frames_dir=frames_dir, frame_count=frame_count,
|
||||
status="running", progress=0)
|
||||
viser_url, log = do_reconstruct(job, worker, frames_dir)
|
||||
set_status(job_id, status="done", viser_url=viser_url, progress=100,
|
||||
log_tail=log,
|
||||
finished_at=_now_iso())
|
||||
viser_url, log, ply_path = do_reconstruct(job, worker, frames_dir)
|
||||
set_status(job_id, status="done", viser_url=viser_url, ply_path=ply_path,
|
||||
progress=100, log_tail=log, finished_at=_now_iso())
|
||||
_maybe_create_per_auv_stitch(job_id)
|
||||
except Exception as e:
|
||||
set_status(job_id, status="error", error=str(e)[:2000],
|
||||
finished_at=_now_iso())
|
||||
set_status(job_id, status="error", error=str(e)[:2000], finished_at=_now_iso())
|
||||
|
||||
|
||||
def pop_queued() -> sqlite3.Row | None:
|
||||
@@ -186,14 +320,28 @@ def pop_queued() -> sqlite3.Row | None:
|
||||
).fetchone()
|
||||
|
||||
|
||||
def pop_queued_stitch() -> sqlite3.Row | None:
    """Return the oldest queued stitch, or None when the queue is empty."""
    query = "SELECT * FROM stitches WHERE status='queued' ORDER BY created_at LIMIT 1"
    with closing(db()) as conn:
        return conn.execute(query).fetchone()
|
||||
|
||||
|
||||
def main():
|
||||
print(f"cosma-qc dispatcher · DB={DB_PATH} · workers={[w['host'] for w in WORKERS]}")
|
||||
while True:
|
||||
job = pop_queued()
|
||||
if job is None:
|
||||
time.sleep(POLL_S); continue
|
||||
print(f"→ picking up job #{job['id']} ({job['auv']}/{job['gopro_serial']}/{job['segment_label']})")
|
||||
run_one(job)
|
||||
if job:
|
||||
print(f"→ job #{job['id']} ({job['auv']}/{job['gopro_serial']}/{job['segment_label']})")
|
||||
run_one(job)
|
||||
continue
|
||||
stitch = pop_queued_stitch()
|
||||
if stitch:
|
||||
label = f"{stitch['level']} {stitch['auv'] or ''} acq#{stitch['acquisition_id']}"
|
||||
print(f"→ stitch #{stitch['id']} ({label})")
|
||||
run_one_stitch(stitch)
|
||||
continue
|
||||
time.sleep(POLL_S)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user