diff --git a/docker-compose.yml b/docker-compose.yml
index 7cfa5bd..ec94b89 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -6,13 +6,11 @@ services:
     ports:
       - "3849:8000"
     volumes:
-      - cosma-qc-db:/var/lib/cosma-qc
+      - /home/floppyrj45/cosma-qc-data:/var/lib/cosma-qc
+      - /home/floppyrj45/.ssh:/ssh-in:ro
    environment:
      COSMA_QC_WORKERS: |
        [
-          {"host":"192.168.0.87","ssh_alias":"gpu","gpu":"RTX 3060 12GB"},
-          {"host":"192.168.0.84","ssh_alias":"ml-stack","gpu":"RTX 3090 24GB"}
+          {"host":"192.168.0.87","ssh_alias":"gpu","gpu":"RTX 3060 12GB","vram_mib":11913,"frames_dir":"/home/floppyrj45/cosma-qc-frames","lingbot_path":"/home/floppyrj45/ai-video/lingbot-map","viser_port_base":8100},
+          {"host":"192.168.0.84","ssh_alias":"ml-stack","gpu":"RTX 3090 24GB","vram_mib":24576,"frames_dir":"/root/cosma-qc-frames","lingbot_path":"/root/ai-video/lingbot-map","viser_port_base":8100}
        ]
-
-volumes:
-  cosma-qc-db:
diff --git a/scripts/__pycache__/dispatcher.cpython-311.pyc b/scripts/__pycache__/dispatcher.cpython-311.pyc
new file mode 100644
index 0000000..4b2de32
Binary files /dev/null and b/scripts/__pycache__/dispatcher.cpython-311.pyc differ
diff --git a/scripts/dispatcher.py b/scripts/dispatcher.py
index a7195d5..1bc99dc 100644
--- a/scripts/dispatcher.py
+++ b/scripts/dispatcher.py
@@ -115,6 +115,10 @@ def estimate_vram_mib(frame_count: int) -> int:
 
 
 def set_status(job_id: int, **fields):
+    # Auto-clear stale error text when the job moves into a live or successful state, so the
+    # dashboard stops showing a previous failure alongside a fresh run.
+    if fields.get("status") in ("extracting", "running", "done", "queued") and "error" not in fields:
+        fields["error"] = None
     keys = list(fields.keys())
     vals = [fields[k] for k in keys]
     q = "UPDATE jobs SET " + ", ".join(f"{k}=?" for k in keys) + " WHERE id=?"
@@ -263,10 +267,11 @@ def do_reconstruct(job: sqlite3.Row, worker: dict, frames_dir: str) -> tuple[str
         f"for i in $(seq 1 3600); do "
         f"  if ! kill -0 $DEMO_PID 2>/dev/null; then wait $DEMO_PID; exit $?; fi; "
         f"  if grep -q 'PLY saved:' {log} 2>/dev/null; then "
-        f"    sleep 2; "
-        f"    pkill -TERM -f \"demo.py.*{frames_dir}\" 2>/dev/null; sleep 1; "
-        f"    pkill -KILL -f \"demo.py.*{frames_dir}\" 2>/dev/null; "
-        f"    wait $DEMO_PID 2>/dev/null; exit 0; "
+        # Keep demo.py alive so its viser/PointCloudViewer (camera frustums, per-frame
+        # confidence filtering, animation) stays reachable; the standalone viser_ply.py only
+        # renders XYZ+RGB, which makes for a poor-looking cloud. Each live demo.py holds ~6 GB
+        # of VRAM until pick_worker can no longer fit a new job; _cleanup_stale_demos reaps the oldest.
+        f"    exit 0; "
         f"  fi; "
         f"  sleep 3; "
         f"done; "
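For reference, a minimal sketch of the reaping path the new comment points at: _cleanup_stale_demos is referenced but not shown in this diff, so everything below is an illustrative assumption, not the repository's implementation. The ssh/ps invocation, the function signature, and the reuse of the ~6 GB-per-viewer figure from the comment above are all guesses; the real dispatcher may match processes and account for VRAM differently.

    import subprocess

    # From the diff's comment: each lingering demo.py viewer is assumed to hold ~6 GB of VRAM.
    APPROX_DEMO_VRAM_MIB = 6 * 1024


    def _cleanup_stale_demos(worker: dict, needed_mib: int, free_mib: int) -> int:
        """Kill the oldest lingering demo.py viewers on `worker` until `needed_mib` of VRAM fits.

        Hypothetical sketch only; signature and process matching are assumptions.
        """
        # Oldest-first listing of demo.py processes: ps sorted by elapsed time, descending.
        # The bracketed grep pattern avoids matching the grep itself; `|| true` keeps exit 0
        # when nothing is found.
        listing = subprocess.run(
            ["ssh", worker["ssh_alias"],
             "ps -eo pid,etimes,args --sort=-etimes | grep '[d]emo.py' || true"],
            capture_output=True, text=True, check=True,
        ).stdout.splitlines()

        for line in listing:
            if free_mib >= needed_mib:
                break
            pid = line.split()[0]
            # TERM is enough here: the monitor loop in do_reconstruct exited long ago, so
            # nothing on the dispatcher side is still waiting on this process.
            subprocess.run(["ssh", worker["ssh_alias"], f"kill -TERM {pid}"], check=False)
            free_mib += APPROX_DEMO_VRAM_MIB  # optimistic: assume the viewer's VRAM is released
        return free_mib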