dispatcher+ingest — fixes pour test end-to-end réel

- dispatcher: scp du MP4 source vers le worker avant ffmpeg (les chemins .82 ne sont pas accessibles côté .87)
- dispatcher: wrapper shell autour de demo.py pour killer viser dès que le PLY est écrit (setsid + pkill -f frames_dir)
- dispatcher: PLY_ok fallback — accepte rc!=0 si le PLY existe et a une taille > 0
- dispatcher: fallback frame_count abaissé à 150 pour l'estimation VRAM
- ingest: strip du suffixe timezone (+00:00) des timestamps exiftool QuickTimeUTC=1

Testé bout-en-bout sur GX010001.MP4 (70 frames, 10.6M pts PLY, VRAM peak 9.4 GB, kill viser OK).
This commit is contained in:
Poulpe
2026-04-21 12:49:09 +00:00
parent 654bb47825
commit 69eb547463
2 changed files with 49 additions and 8 deletions

View File

@@ -120,6 +120,15 @@ def count_frames(worker: dict, frames_dir: str) -> int:
return 0 return 0
def scp_to_worker(local_path: str, worker: dict, remote_path: str):
r = subprocess.run(
["scp", "-o", "BatchMode=yes", local_path, f"{worker['ssh_alias']}:{remote_path}"],
capture_output=True, timeout=1800,
)
if r.returncode != 0:
raise RuntimeError(f"scp failed: {r.stderr.decode()[:200]}")
def do_extract(job: sqlite3.Row, worker: dict) -> str: def do_extract(job: sqlite3.Row, worker: dict) -> str:
videos = json.loads(job["video_paths"]) videos = json.loads(job["video_paths"])
frames_dir = f"{worker['frames_dir']}/job_{job['id']}" frames_dir = f"{worker['frames_dir']}/job_{job['id']}"
@@ -128,8 +137,14 @@ def do_extract(job: sqlite3.Row, worker: dict) -> str:
for v in videos: for v in videos:
vf = f"fps={FPS},scale={IMG_W}:{IMG_H}" vf = f"fps={FPS},scale={IMG_W}:{IMG_H}"
pattern = f"{frames_dir}/frame_%06d.jpg" pattern = f"{frames_dir}/frame_%06d.jpg"
# Copy video to worker if it doesn't exist there
worker_src = f"{frames_dir}/src_{Path(v).name}"
rc_check = ssh(worker["ssh_alias"], f"test -f {shlex.quote(worker_src)}")[0]
if rc_check != 0:
print(f" scp {Path(v).name}{worker['host']}...")
scp_to_worker(v, worker, worker_src)
cmd = ( cmd = (
f"ffmpeg -hide_banner -loglevel error -i {shlex.quote(v)} " f"ffmpeg -hide_banner -loglevel error -i {shlex.quote(worker_src)} "
f"-vf {shlex.quote(vf)} -start_number {idx} -q:v 4 " f"-vf {shlex.quote(vf)} -start_number {idx} -q:v 4 "
f"{shlex.quote(pattern)}" f"{shlex.quote(pattern)}"
) )
@@ -146,18 +161,39 @@ def do_reconstruct(job: sqlite3.Row, worker: dict, frames_dir: str) -> tuple[str
log = f"/tmp/cosma-qc-job-{job['id']}.log" log = f"/tmp/cosma-qc-job-{job['id']}.log"
ckpt = f"{worker['lingbot_path']}/checkpoints/lingbot-map/lingbot-map-long.pt" ckpt = f"{worker['lingbot_path']}/checkpoints/lingbot-map/lingbot-map-long.pt"
ply_path = f"{frames_dir}/reconstruction.ply" ply_path = f"{frames_dir}/reconstruction.ply"
# demo.py starts a viser web server after saving the PLY and never exits.
# Wrap it: launch in bg, wait for "PLY saved" marker in the log, kill, exit 0.
# Match on the unique job frames_dir to identify our demo.py among all children/threads.
marker = shlex.quote(frames_dir)
cmd = ( cmd = (
f"cd {shlex.quote(worker['lingbot_path'])} && source .venv/bin/activate && " f"cd {shlex.quote(worker['lingbot_path'])} && source .venv/bin/activate && "
f"python3 demo.py --model_path {shlex.quote(ckpt)} " f"setsid python3 demo.py --model_path {shlex.quote(ckpt)} "
f"--image_folder {shlex.quote(frames_dir)} --port {port} " f"--image_folder {shlex.quote(frames_dir)} --port {port} "
f"--use_sdpa --mode windowed --window_size 16 --overlap_size 2 --offload_to_cpu " f"--use_sdpa --mode windowed --window_size 16 --overlap_size 2 --offload_to_cpu "
f"--save_ply {shlex.quote(ply_path)} " f"--save_ply {shlex.quote(ply_path)} > {log} 2>&1 & "
f"> {log} 2>&1" f"DEMO_PID=$!; "
f"for i in $(seq 1 3600); do "
f" if ! kill -0 $DEMO_PID 2>/dev/null; then wait $DEMO_PID; exit $?; fi; "
f" if grep -q 'PLY saved:' {log} 2>/dev/null; then "
f" sleep 2; "
f" pkill -TERM -f \"demo.py.*{frames_dir}\" 2>/dev/null; sleep 1; "
f" pkill -KILL -f \"demo.py.*{frames_dir}\" 2>/dev/null; "
f" wait $DEMO_PID 2>/dev/null; exit 0; "
f" fi; "
f" sleep 3; "
f"done; "
f"pkill -KILL -f \"demo.py.*{frames_dir}\" 2>/dev/null; exit 124"
) )
rc, _, err = ssh(worker["ssh_alias"], cmd, timeout=3 * 3600) rc, _, err = ssh(worker["ssh_alias"], cmd, timeout=3 * 3600)
if rc != 0: # Accept rc==0 OR PLY file exists with non-zero size (kill -TERM may return non-zero)
ply_rc, ply_size, _ = ssh(worker["ssh_alias"], f"stat -c %s {shlex.quote(ply_path)} 2>/dev/null || echo 0")
try:
ply_ok = int(ply_size.strip()) > 0
except ValueError:
ply_ok = False
if not ply_ok:
tail = ssh(worker["ssh_alias"], f"tail -30 {log}")[1] tail = ssh(worker["ssh_alias"], f"tail -30 {log}")[1]
raise RuntimeError(f"demo.py failed: {err[:200]}\n---\n{tail[:800]}") raise RuntimeError(f"demo.py failed (rc={rc}): {err[:200]}\n---\n{tail[:800]}")
viser_url = f"http://{worker['host']}:{port}" viser_url = f"http://{worker['host']}:{port}"
return viser_url, log, ply_path return viser_url, log, ply_path
@@ -296,7 +332,7 @@ def run_one_stitch(stitch: sqlite3.Row):
def run_one(job: sqlite3.Row) -> bool: def run_one(job: sqlite3.Row) -> bool:
"""Returns True if a worker was picked and work started.""" """Returns True if a worker was picked and work started."""
job_id = job["id"] job_id = job["id"]
estimated = estimate_vram_mib(job["frame_count"] or 400) estimated = estimate_vram_mib(job["frame_count"] or 150)
worker = pick_worker(estimated) worker = pick_worker(estimated)
if not worker: if not worker:
return False return False

View File

@@ -34,7 +34,12 @@ def exif_create_date(path: Path) -> datetime | None:
["exiftool", "-s3", "-CreateDate", "-api", "QuickTimeUTC=1", str(path)], ["exiftool", "-s3", "-CreateDate", "-api", "QuickTimeUTC=1", str(path)],
stderr=subprocess.DEVNULL, text=True, timeout=10, stderr=subprocess.DEVNULL, text=True, timeout=10,
).strip() ).strip()
return datetime.strptime(out, "%Y:%m:%d %H:%M:%S") if out else None if not out:
return None
# Strip timezone suffix (+HH:MM or -HH:MM) if present
import re as _re
out = _re.sub(r'[+-]\d{2}:\d{2}$', '', out).strip()
return datetime.strptime(out, "%Y:%m:%d %H:%M:%S")
except Exception: except Exception:
return None return None