From 3eb568f14ee116bf5bce01dc118e00f8c2b52cbf Mon Sep 17 00:00:00 2001 From: Flag Date: Wed, 22 Apr 2026 15:39:56 +0000 Subject: [PATCH] =?UTF-8?q?dispatcher=20=E2=80=94=20rm=20worker=5Fsrc=20apres?= =?UTF-8?q?=20extract=20+=20fstrim=20pour=20eviter=20thin=20pool=20full?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Le cache src_*.MP4 sur les workers s'empile: 12 fichiers pour 82 GB au pire. Le thin pool LVM sur le host Proxmox est trop petit (810 GB pour 1144 GB thick-provisioned) et se remplit a 100% en quelques heures de pipeline -> I/O errors -> VMs auto-paused -> tout casse. Fix: delete src_*.MP4 immediatement apres count_frames (les frames sont deja extraites), puis fstrim en fin de job pour que le thin pool reclaim les blocks immediatement via DISCARD/UNMAP. --- scripts/dispatcher.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/dispatcher.py b/scripts/dispatcher.py index c42e3c2..3a6d21d 100644 --- a/scripts/dispatcher.py +++ b/scripts/dispatcher.py @@ -225,7 +225,12 @@ def do_extract(job: sqlite3.Row, worker: dict) -> str: _, err, _ = ssh(worker["ssh_alias"], f"cat /tmp/cosma-ffmpeg-{job['id']}.log 2>/dev/null | tail -5 || echo ''") raise RuntimeError(f"ffmpeg failed on {v}: {err[:200]}") idx = count_frames(worker, frames_dir) + # Free MP4 cache immediately: thin pool on Proxmox host is tight and src_*.MP4 + # are 1-11 GB each. Frames are already extracted so worker_src is no longer needed. + ssh(worker["ssh_alias"], f"rm -f {shlex.quote(worker_src)}") set_status(job["id"], frame_count=idx, progress=min(99, idx * 100 // total_frames_est)) + # Trim once per job so LVM thin pool on the host actually reclaims the freed blocks. + ssh(worker["ssh_alias"], "sudo fstrim / 2>/dev/null || fstrim / 2>/dev/null", timeout=60) return frames_dir