dispatcher : nettoyage des frames avant extraction + budget RAM 0.55 -> 0.45
Bugs découverts en live : 1. Les retries/restarts ne nettoyaient pas frames_dir -> ffmpeg ré-extrayait par-dessus les anciennes frames -> frame_count gonflé (ex. : 21991 au lieu de 11000) -> budget de stride faussé -> OOM. 2. Le budget de 0.55*RAM ne laissait pas assez de headroom (OS + buffers CUDA pinned + autres processus) -> kill -9 à la limite. 0.45 est plus conservateur.
This commit is contained in:
@@ -184,7 +184,9 @@ def video_duration_s(worker: dict, worker_src: str) -> float:
|
|||||||
def do_extract(job: sqlite3.Row, worker: dict) -> str:
|
def do_extract(job: sqlite3.Row, worker: dict) -> str:
|
||||||
videos = json.loads(job["video_paths"])
|
videos = json.loads(job["video_paths"])
|
||||||
frames_dir = f"{worker['frames_dir']}/job_{job['id']}"
|
frames_dir = f"{worker['frames_dir']}/job_{job['id']}"
|
||||||
ssh(worker["ssh_alias"], f"mkdir -p {shlex.quote(frames_dir)}")
|
# Clean any frame_*.jpg from a prior run so count_frames reflects this extraction only
|
||||||
|
# (retries/restarts otherwise inflate frame_count with duplicates).
|
||||||
|
ssh(worker["ssh_alias"], f"mkdir -p {shlex.quote(frames_dir)} && rm -f {shlex.quote(frames_dir)}/frame_*.jpg")
|
||||||
idx = 0
|
idx = 0
|
||||||
total_frames_est = 0 # will be computed after each scp
|
total_frames_est = 0 # will be computed after each scp
|
||||||
for v in videos:
|
for v in videos:
|
||||||
@@ -243,7 +245,7 @@ def do_reconstruct(job: sqlite3.Row, worker: dict, frames_dir: str) -> tuple[str
|
|||||||
# .87 has 23 GB RAM, .84 has 62 GB. Keep effective frame count ~4k to stay safe.
|
# .87 has 23 GB RAM, .84 has 62 GB. Keep effective frame count ~4k to stay safe.
|
||||||
frame_count = job["frame_count"] or 0
|
frame_count = job["frame_count"] or 0
|
||||||
ram_gb = 23 if worker["host"] == "192.168.0.87" else 62
|
ram_gb = 23 if worker["host"] == "192.168.0.87" else 62
|
||||||
ram_budget_gb = ram_gb * 0.55
|
ram_budget_gb = ram_gb * 0.45 # leave headroom for model + OS + cuda pinned buffers
|
||||||
stride = 1
|
stride = 1
|
||||||
while frame_count * 3.15 / 1024 / stride > ram_budget_gb:
|
while frame_count * 3.15 / 1024 / stride > ram_budget_gb:
|
||||||
stride += 1
|
stride += 1
|
||||||
|
|||||||
Reference in New Issue
Block a user