dispatcher: keep demo.py alive after PLY + auto-clear error
1. Ne plus tuer demo.py après « PLY saved: » : son viser/PointCloudViewer natif (camera frustums, filtrage de confiance par frame, animation) donne une visualisation beaucoup plus propre que viser_ply.py en standalone (XYZ+RGB seulement). Coût : ~6 Go de VRAM par job terminé ; on le garde vivant jusqu'au prochain pick_worker, qui peut le tuer si besoin. 2. set_status efface automatiquement le champ error quand status passe à extracting/running/done/queued : sinon les dashboards affichent les erreurs historiques sur les jobs en cours de retry.
This commit is contained in:
@@ -6,13 +6,11 @@ services:
|
||||
ports:
|
||||
- "3849:8000"
|
||||
volumes:
|
||||
- cosma-qc-db:/var/lib/cosma-qc
|
||||
- /home/floppyrj45/cosma-qc-data:/var/lib/cosma-qc
|
||||
- /home/floppyrj45/.ssh:/ssh-in:ro
|
||||
environment:
|
||||
COSMA_QC_WORKERS: |
|
||||
[
|
||||
{"host":"192.168.0.87","ssh_alias":"gpu","gpu":"RTX 3060 12GB"},
|
||||
{"host":"192.168.0.84","ssh_alias":"ml-stack","gpu":"RTX 3090 24GB"}
|
||||
{"host":"192.168.0.87","ssh_alias":"gpu","gpu":"RTX 3060 12GB","vram_mib":11913,"frames_dir":"/home/floppyrj45/cosma-qc-frames","lingbot_path":"/home/floppyrj45/ai-video/lingbot-map","viser_port_base":8100},
|
||||
{"host":"192.168.0.84","ssh_alias":"ml-stack","gpu":"RTX 3090 24GB","vram_mib":24576,"frames_dir":"/root/cosma-qc-frames","lingbot_path":"/root/ai-video/lingbot-map","viser_port_base":8100}
|
||||
]
|
||||
|
||||
volumes:
|
||||
cosma-qc-db:
|
||||
|
||||
BIN
scripts/__pycache__/dispatcher.cpython-311.pyc
Normal file
BIN
scripts/__pycache__/dispatcher.cpython-311.pyc
Normal file
Binary file not shown.
@@ -115,6 +115,10 @@ def estimate_vram_mib(frame_count: int) -> int:
|
||||
|
||||
|
||||
def set_status(job_id: int, **fields):
|
||||
# Auto-clear stale error text when the job moves into a live state so the dashboard
|
||||
# stops showing a previous failure alongside a fresh run.
|
||||
if fields.get("status") in ("extracting", "running", "done", "queued") and "error" not in fields:
|
||||
fields["error"] = None
|
||||
keys = list(fields.keys())
|
||||
vals = [fields[k] for k in keys]
|
||||
q = "UPDATE jobs SET " + ", ".join(f"{k}=?" for k in keys) + " WHERE id=?"
|
||||
@@ -263,10 +267,11 @@ def do_reconstruct(job: sqlite3.Row, worker: dict, frames_dir: str) -> tuple[str
|
||||
f"for i in $(seq 1 3600); do "
|
||||
f" if ! kill -0 $DEMO_PID 2>/dev/null; then wait $DEMO_PID; exit $?; fi; "
|
||||
f" if grep -q 'PLY saved:' {log} 2>/dev/null; then "
|
||||
f" sleep 2; "
|
||||
f" pkill -TERM -f \"demo.py.*{frames_dir}\" 2>/dev/null; sleep 1; "
|
||||
f" pkill -KILL -f \"demo.py.*{frames_dir}\" 2>/dev/null; "
|
||||
f" wait $DEMO_PID 2>/dev/null; exit 0; "
|
||||
# Keep demo.py alive so its viser/PointCloudViewer (with camera frustums, per-frame
|
||||
# confidence filtering, animation) stays reachable. Standalone viser_ply.py only has
|
||||
# XYZ+RGB which gives a poor-looking cloud. The worker eats ~6GB VRAM per alive demo.py
|
||||
# until pick_worker can no longer fit a new job; _cleanup_stale_demos reaps the oldest.
|
||||
f" exit 0; "
|
||||
f" fi; "
|
||||
f" sleep 3; "
|
||||
f"done; "
|
||||
|
||||
Reference in New Issue
Block a user