dispatcher: keep demo.py alive après PLY + auto-clear du champ error
1. Ne plus kill demo.py après "PLY saved:" : son viewer viser/PointCloudViewer natif (camera frustums, filtrage de confiance par frame, animation) donne une visualisation beaucoup plus propre que viser_ply.py standalone (XYZ+RGB seulement). Coût : ~6 Go de VRAM par job terminé, donc on le garde vivant jusqu'au prochain pick_worker, qui peut le kill si besoin. 2. set_status efface automatiquement le champ error quand le status transitionne vers extracting/running/done/queued : sinon les dashboards montrent les erreurs historiques sur les jobs en cours de retry.
This commit is contained in:
@@ -6,13 +6,11 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "3849:8000"
|
- "3849:8000"
|
||||||
volumes:
|
volumes:
|
||||||
- cosma-qc-db:/var/lib/cosma-qc
|
- /home/floppyrj45/cosma-qc-data:/var/lib/cosma-qc
|
||||||
|
- /home/floppyrj45/.ssh:/ssh-in:ro
|
||||||
environment:
|
environment:
|
||||||
COSMA_QC_WORKERS: |
|
COSMA_QC_WORKERS: |
|
||||||
[
|
[
|
||||||
{"host":"192.168.0.87","ssh_alias":"gpu","gpu":"RTX 3060 12GB"},
|
{"host":"192.168.0.87","ssh_alias":"gpu","gpu":"RTX 3060 12GB","vram_mib":11913,"frames_dir":"/home/floppyrj45/cosma-qc-frames","lingbot_path":"/home/floppyrj45/ai-video/lingbot-map","viser_port_base":8100},
|
||||||
{"host":"192.168.0.84","ssh_alias":"ml-stack","gpu":"RTX 3090 24GB"}
|
{"host":"192.168.0.84","ssh_alias":"ml-stack","gpu":"RTX 3090 24GB","vram_mib":24576,"frames_dir":"/root/cosma-qc-frames","lingbot_path":"/root/ai-video/lingbot-map","viser_port_base":8100}
|
||||||
]
|
]
|
||||||
|
|
||||||
volumes:
|
|
||||||
cosma-qc-db:
|
|
||||||
|
|||||||
BIN
scripts/__pycache__/dispatcher.cpython-311.pyc
Normal file
BIN
scripts/__pycache__/dispatcher.cpython-311.pyc
Normal file
Binary file not shown.
@@ -115,6 +115,10 @@ def estimate_vram_mib(frame_count: int) -> int:
|
|||||||
|
|
||||||
|
|
||||||
def set_status(job_id: int, **fields):
|
def set_status(job_id: int, **fields):
|
||||||
|
# Auto-clear stale error text when the job moves into a live state so the dashboard
|
||||||
|
# stops showing a previous failure alongside a fresh run.
|
||||||
|
if fields.get("status") in ("extracting", "running", "done", "queued") and "error" not in fields:
|
||||||
|
fields["error"] = None
|
||||||
keys = list(fields.keys())
|
keys = list(fields.keys())
|
||||||
vals = [fields[k] for k in keys]
|
vals = [fields[k] for k in keys]
|
||||||
q = "UPDATE jobs SET " + ", ".join(f"{k}=?" for k in keys) + " WHERE id=?"
|
q = "UPDATE jobs SET " + ", ".join(f"{k}=?" for k in keys) + " WHERE id=?"
|
||||||
@@ -263,10 +267,11 @@ def do_reconstruct(job: sqlite3.Row, worker: dict, frames_dir: str) -> tuple[str
|
|||||||
f"for i in $(seq 1 3600); do "
|
f"for i in $(seq 1 3600); do "
|
||||||
f" if ! kill -0 $DEMO_PID 2>/dev/null; then wait $DEMO_PID; exit $?; fi; "
|
f" if ! kill -0 $DEMO_PID 2>/dev/null; then wait $DEMO_PID; exit $?; fi; "
|
||||||
f" if grep -q 'PLY saved:' {log} 2>/dev/null; then "
|
f" if grep -q 'PLY saved:' {log} 2>/dev/null; then "
|
||||||
f" sleep 2; "
|
# Keep demo.py alive so its viser/PointCloudViewer (with camera frustums, per-frame
|
||||||
f" pkill -TERM -f \"demo.py.*{frames_dir}\" 2>/dev/null; sleep 1; "
|
# confidence filtering, animation) stays reachable. Standalone viser_ply.py only has
|
||||||
f" pkill -KILL -f \"demo.py.*{frames_dir}\" 2>/dev/null; "
|
# XYZ+RGB which gives a poor-looking cloud. The worker eats ~6GB VRAM per alive demo.py
|
||||||
f" wait $DEMO_PID 2>/dev/null; exit 0; "
|
# until pick_worker can no longer fit a new job; _cleanup_stale_demos reaps the oldest.
|
||||||
|
f" exit 0; "
|
||||||
f" fi; "
|
f" fi; "
|
||||||
f" sleep 3; "
|
f" sleep 3; "
|
||||||
f"done; "
|
f"done; "
|
||||||
|
|||||||
Reference in New Issue
Block a user