Compare commits

...

2 Commits

Author SHA1 Message Date
Poulpe
5ead87d59c fix(05-inference): min_frames guard + configurable timeout
- Skip segments with < min_frames_for_inference (32) frames — prevents
  RoPE/attention tensor mismatch (GX029838: 20 frames)
- Timeout now reads inference_timeout_s from thresholds.yaml (default 3h)
  GX029818 (493 frames) timed out at 7200s — raised to 10800s

Authored-by: Poulpe <claude@nowyouknow.fr>
2026-05-13 10:37:04 +00:00
Poulpe
c7c4431e72 auto-iter 2026-05-13: inference min_frames=32 + timeout 3h (was 2h)
- min_frames_for_inference: 32 (RoPE/attention needs ≥32 frames)
- inference_timeout_s: 10800 (GX029818 timed out at 7200s with 493 frames)

Authored-by: Poulpe <claude@nowyouknow.fr>
2026-05-13 10:36:28 +00:00
2 changed files with 17 additions and 1 deletion

View File

@@ -22,6 +22,8 @@ inference:
max_frame_num: 1024 max_frame_num: 1024
mode: streaming mode: streaming
keyframe_interval: 1 keyframe_interval: 1
min_frames_for_inference: 32 # fewer frames → RoPE/attention mismatch errors
inference_timeout_s: 10800 # 3h (was 7200=2h, GX029818 timed out with 493 frames)
align: align:
max_translation_m: 500 # sanity check on alignment max_translation_m: 500 # sanity check on alignment

View File

@@ -195,9 +195,10 @@ def run_inference(frames_dir: Path, worker_key: str, mission_name: str,
print(f" [05] Launching inference on {host}...") print(f" [05] Launching inference on {host}...")
t0 = time.time() t0 = time.time()
inf_timeout = int(_INF_CFG.get("inference_timeout_s", 10800))
r = subprocess.run( r = subprocess.run(
["ssh", "-o", "StrictHostKeyChecking=no", ssh_target, demo_cmd], ["ssh", "-o", "StrictHostKeyChecking=no", ssh_target, demo_cmd],
capture_output=True, text=True, timeout=7200, # 2h max capture_output=True, text=True, timeout=inf_timeout,
) )
elapsed = time.time() - t0 elapsed = time.time() - t0
metrics["inference_s"] = round(elapsed, 1) metrics["inference_s"] = round(elapsed, 1)
@@ -265,6 +266,19 @@ def process_frames_dir(frames_dir: Path, worker_key: str, mission_name: str) ->
if not frames: if not frames:
continue continue
print(f"\n[05] === {auv_id}/{seg_dir.name}: {len(frames)} frames ===") print(f"\n[05] === {auv_id}/{seg_dir.name}: {len(frames)} frames ===")
# Guard: min frames required for model (RoPE/attention)
min_frames = int(_INF_CFG.get("min_frames_for_inference", 32))
if len(frames) < min_frames:
print(f" [05] SKIP {auv_id}/{seg_dir.name}: {len(frames)} frames < {min_frames} min")
init_db()
with get_conn() as conn_mf:
mr = conn_mf.execute("SELECT id FROM missions WHERE name=?", (mission_name,)).fetchone()
if mr:
upsert_job(conn_mf, mr["id"], auv_id, seg_dir.name, "05_inference",
status="skipped",
error_msg=f"frames_too_few={len(frames)}<{min_frames}")
continue
m = run_inference(seg_dir, worker_key, mission_name, auv_id, seg_dir.name) m = run_inference(seg_dir, worker_key, mission_name, auv_id, seg_dir.name)
all_metrics.append(m) all_metrics.append(m)