diff --git a/scripts/dispatcher.py b/scripts/dispatcher.py index c073485..f740c46 100644 --- a/scripts/dispatcher.py +++ b/scripts/dispatcher.py @@ -109,7 +109,9 @@ def release_worker(worker: dict, estimated_vram_mib: int): def estimate_vram_mib(frame_count: int) -> int: - return int(3500 + 13 * frame_count) + # windowed mode + offload_to_cpu caps VRAM usage regardless of total frames. + # Observed: ~3.5 GB model + ~1.5 GB working set for window_size=16. Safe budget: 6 GB. + return 6000 def set_status(job_id: int, **fields): @@ -230,6 +232,14 @@ def do_reconstruct(job: sqlite3.Row, worker: dict, frames_dir: str) -> tuple[str log = f"/tmp/cosma-qc-job-{job['id']}.log" ckpt = f"{worker['lingbot_path']}/checkpoints/lingbot-map/lingbot-map-long.pt" ply_path = f"{frames_dir}/reconstruction.ply" + # Adaptive stride to fit CPU RAM: load_fn stacks full image tensor ~3.15 MB/frame @ 512x512x3 fp32. + # .87 has 23 GB RAM, .84 has 62 GB. Keep effective frame count ~4k to stay safe. + frame_count = job["frame_count"] or 0 + ram_gb = 23 if worker["host"] == "192.168.0.87" else 62 + ram_budget_gb = ram_gb * 0.55 + stride = 1 + while frame_count * 3.15 / 1024 / stride > ram_budget_gb: + stride += 1 # demo.py starts a viser web server after saving the PLY and never exits. # Wrap it: launch in bg, wait for "PLY saved" marker in the log, kill, exit 0. # Match on the unique job frames_dir to identify our demo.py among all children/threads. @@ -238,7 +248,7 @@ def do_reconstruct(job: sqlite3.Row, worker: dict, frames_dir: str) -> tuple[str f"cd {shlex.quote(worker['lingbot_path'])} && source .venv/bin/activate && " f"setsid python3 demo.py --model_path {shlex.quote(ckpt)} " f"--image_folder {shlex.quote(frames_dir)} --port {port} " - f"--use_sdpa --mode windowed --window_size 16 --overlap_size 2 --offload_to_cpu " + f"--stride {stride} --use_sdpa --mode windowed --window_size 16 --overlap_size 2 --offload_to_cpu " f"--save_ply {shlex.quote(ply_path)} > {log} 2>&1 & " f"DEMO_PID=$!; " f"for i in $(seq 1 3600); do "