feat: hook post-job cosma-nav + style dashboard + docker-compose update

This commit is contained in:
Ubuntu
2026-04-25 16:25:19 +00:00
parent ffcb254fa8
commit 4db7b13bb0
4 changed files with 64 additions and 10 deletions

View File

@@ -806,6 +806,43 @@ def run_one_stitch(stitch: sqlite3.Row):
finished_at=_now_iso())
# Worker on which the post-job scripts are run: ML_STACK_HOST is compared
# against a worker's "host" entry, ML_STACK_ALIAS is the target handed to ssh().
ML_STACK_HOST = "192.168.0.84"
ML_STACK_ALIAS = "ml-stack"

# Post-job scripts invoked over ssh on the ml-stack worker.
_COSMA_NAV_SCRIPTS = "/root/cosma-nav/scripts"
_PRE_DECIMATE = f"{_COSMA_NAV_SCRIPTS}/pre_decimate.py"
_ARCHIVE_SH = f"{_COSMA_NAV_SCRIPTS}/archive_job.sh"
def _run_post_job_step(alias: str, job_id: int, label: str, cmd: str, log_path: str) -> bool:
    """Run one remote post-job step over ssh and report the outcome on stdout.

    *cmd* is executed on *alias* with stdout/stderr redirected to *log_path*
    (a path on the remote host). On a non-zero exit code the last lines of
    that log are fetched and printed. Returns True only when the step exited
    with code 0.
    """
    try:
        rc, _, _ = ssh(alias, f"{cmd} > {log_path} 2>&1", timeout=600)
        if rc == 0:
            print(f" post_job #{job_id}: {label} OK", flush=True)
            return True
        tail = ssh(alias, f"tail -5 {log_path}")[1]
        # Keep the END of the excerpt: the decisive error message is normally
        # the last line of the log, so truncating from the front preserves it.
        print(f" post_job #{job_id}: {label} FAIL: {tail[-300:]}", flush=True)
    except Exception as exc:
        # Thread boundary: this runs in a fire-and-forget daemon thread, so
        # report the problem with the job id instead of killing the thread
        # (which would also prevent the following steps from running).
        print(f" post_job #{job_id}: {label} ERROR: {exc!r}", flush=True)
    return False


def _post_job_qc_sync(job_id: int, worker: dict, frames_dir: str) -> None:
    """Fire-and-forget: decimate PLY + archive to NAS after a successful job.

    Only runs when the worker is ml-stack (.84) where the scripts live; for
    any other worker a skip message is printed and nothing is executed.

    Args:
        job_id: Id of the finished job; passed to both scripts and used in
            the remote log file names under /tmp.
        worker: Worker description; only its "host" entry is read.
        frames_dir: Frames directory of the job; its PARENT directory is what
            is handed to pre_decimate via --frames-dir.

    The archive step runs even when pre_decimate fails. Outcomes are only
    printed; nothing is returned or raised to the caller.
    """
    if worker["host"] != ML_STACK_HOST:
        print(f" post_job #{job_id}: worker={worker['host']} != ml-stack, skip QC sync", flush=True)
        return

    parent = str(Path(frames_dir).parent)
    # (label, remote command, remote log file) for each step, in run order.
    steps = (
        (
            "pre_decimate",
            f"python3 {_PRE_DECIMATE} {job_id} --frames-dir {shlex.quote(parent)}",
            f"/tmp/pre_decimate_{job_id}.log",
        ),
        (
            "archive",
            f"bash {_ARCHIVE_SH} {job_id}",
            f"/tmp/archive_{job_id}.log",
        ),
    )
    for label, cmd, log_path in steps:
        _run_post_job_step(ML_STACK_ALIAS, job_id, label, cmd, log_path)
def run_one(job: sqlite3.Row) -> bool:
"""Returns True if a worker was picked and work started."""
job_id = job["id"]
@@ -825,6 +862,7 @@ def run_one(job: sqlite3.Row) -> bool:
set_status(job_id, status="done", viser_url=viser_url, ply_path=ply_path,
progress=100, log_tail=log, finished_at=_now_iso())
_maybe_create_per_auv_stitch(job_id)
threading.Thread(target=_post_job_qc_sync, args=(job_id, worker, frames_dir), daemon=True).start()
except Exception as e:
# do_extract raises "skipped_short" after flagging status='skipped' — don't override.
if "skipped_short" not in str(e):