dispatcher — clean frames avant extract + budget RAM 0.55 -> 0.45

Bugs découverts en live :
1. Les retries/restarts ne nettoyaient pas frames_dir -> ffmpeg ré-extrayait
   par-dessus les anciennes frames -> frame_count gonflé (ex. : 21991 au lieu
   de 11000) -> budget de stride faussé -> OOM.
2. Le budget de 0.55*RAM ne laissait pas assez de headroom (OS + CUDA pinned
   buffers + autres processus) -> kill -9 à la limite. 0.45 est plus conservateur.
This commit is contained in:
Flag
2026-04-22 19:25:56 +00:00
parent 7630e72dcb
commit cfbb542992

View File

@@ -184,7 +184,9 @@ def video_duration_s(worker: dict, worker_src: str) -> float:
def do_extract(job: sqlite3.Row, worker: dict) -> str: def do_extract(job: sqlite3.Row, worker: dict) -> str:
videos = json.loads(job["video_paths"]) videos = json.loads(job["video_paths"])
frames_dir = f"{worker['frames_dir']}/job_{job['id']}" frames_dir = f"{worker['frames_dir']}/job_{job['id']}"
ssh(worker["ssh_alias"], f"mkdir -p {shlex.quote(frames_dir)}") # Clean any frame_*.jpg from a prior run so count_frames reflects this extraction only
# (retries/restarts otherwise inflate frame_count with duplicates).
ssh(worker["ssh_alias"], f"mkdir -p {shlex.quote(frames_dir)} && rm -f {shlex.quote(frames_dir)}/frame_*.jpg")
idx = 0 idx = 0
total_frames_est = 0 # will be computed after each scp total_frames_est = 0 # will be computed after each scp
for v in videos: for v in videos:
@@ -243,7 +245,7 @@ def do_reconstruct(job: sqlite3.Row, worker: dict, frames_dir: str) -> tuple[str
# .87 has 23 GB RAM, .84 has 62 GB. Keep effective frame count ~4k to stay safe. # .87 has 23 GB RAM, .84 has 62 GB. Keep effective frame count ~4k to stay safe.
frame_count = job["frame_count"] or 0 frame_count = job["frame_count"] or 0
ram_gb = 23 if worker["host"] == "192.168.0.87" else 62 ram_gb = 23 if worker["host"] == "192.168.0.87" else 62
ram_budget_gb = ram_gb * 0.55 ram_budget_gb = ram_gb * 0.45 # leave headroom for model + OS + cuda pinned buffers
stride = 1 stride = 1
while frame_count * 3.15 / 1024 / stride > ram_budget_gb: while frame_count * 3.15 / 1024 / stride > ram_budget_gb:
stride += 1 stride += 1