From 3eb568f14ee116bf5bce01dc118e00f8c2b52cbf Mon Sep 17 00:00:00 2001
From: Flag <flag@laboratoire>
Date: Wed, 22 Apr 2026 15:39:56 +0000
Subject: [PATCH] =?UTF-8?q?dispatcher=20=E2=80=94=20rm=20worker=5Fsrc=20apres?=
 =?UTF-8?q?=20extract=20+=20fstrim=20pour=20eviter=20thin=20pool=20full?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Le cache src_*.MP4 sur les workers s'empile: 12 fichiers pour 82 GB au pire.
Le thin pool LVM sur le host Proxmox est trop petit (810 GB pour 1144 GB
thick-provisioned) et se remplit a 100% en quelques heures de pipeline
-> I/O errors -> VMs auto-paused -> tout casse.

Fix: delete src_*.MP4 immediatement apres count_frames (les frames sont
deja extraites), puis fstrim en fin de job pour que le thin pool reclaim
les blocks immediatement via DISCARD/UNMAP.
---
 scripts/dispatcher.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scripts/dispatcher.py b/scripts/dispatcher.py
index c42e3c2..3a6d21d 100644
--- a/scripts/dispatcher.py
+++ b/scripts/dispatcher.py
@@ -225,7 +225,12 @@ def do_extract(job: sqlite3.Row, worker: dict) -> str:
             _, err, _ = ssh(worker["ssh_alias"], f"cat /tmp/cosma-ffmpeg-{job['id']}.log 2>/dev/null | tail -5 || echo ''")
             raise RuntimeError(f"ffmpeg failed on {v}: {err[:200]}")
         idx = count_frames(worker, frames_dir)
+        # Free MP4 cache immediately: thin pool on Proxmox host is tight and src_*.MP4
+        # are 1-11 GB each. Frames are already extracted so worker_src is no longer needed.
+        ssh(worker["ssh_alias"], f"rm -f {shlex.quote(worker_src)}")
         set_status(job["id"], frame_count=idx, progress=min(99, idx * 100 // total_frames_est))
+    # Trim once per job so LVM thin pool on the host actually reclaims the freed blocks.
+    ssh(worker["ssh_alias"], "sudo fstrim / 2>/dev/null || fstrim / 2>/dev/null", timeout=60)
     return frames_dir