dispatcher: load balance 2 GPU (lower-load d'abord) + fps=2 + debug pick_worker
pick_worker trie les candidats par : 1. nombre de jobs déjà assignés sur le worker (moins d'abord) 2. VRAM libre (plus d'abord) 3. hostname (tiebreaker sans comparer les dicts). Avant : le worker avec le plus de VRAM gagnait toujours (ex : .84 24 GB vs .87 12 GB), donc tous les jobs s'empilaient sur .84 pendant que .87 restait idle. fps=3 -> fps=2 via COSMA_QC_FPS dans dispatcher.env (cf. user : 1 kt + reconstruction SfM -> 2 fps, pas stride 6). Logs pick_worker ajoutés pour debug quand "no candidate".
This commit is contained in:
@@ -103,25 +103,37 @@ def worker_free_vram_mib(worker: dict) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
# Number of in-flight jobs per worker host. Read/written only while holding
# _worker_lock (see pick_worker / release_worker); primary load-balancing key.
_jobs_per_worker: dict[str, int] = {}
|
||||
|
||||
|
||||
def pick_worker(estimated_vram_mib: int) -> dict | None:
    """Select the worker for a new job and reserve its VRAM.

    Only workers whose effective free VRAM (reported free minus local
    reservations) covers *estimated_vram_mib* are eligible. Among those, the
    least-busy worker wins (fewest in-flight jobs), then the one with the most
    free VRAM, then hostname as a deterministic tiebreaker — otherwise the
    beefiest GPU always wins and the second worker sits idle.

    Returns the chosen worker dict (with the reservation and job count already
    bumped), or None when no worker fits.
    """
    with _worker_lock:
        eligible: list[tuple[tuple[int, int, str], dict]] = []
        for worker in WORKERS:
            host = worker["host"]
            free = worker_free_vram_mib(worker) - _reserved_vram.get(host, 0)
            # Debug trace: one line per worker so "no candidate" cases are explainable.
            print(f" pick_worker: {worker['host']} free={free} reserved={_reserved_vram.get(worker['host'], 0)} load={_jobs_per_worker.get(worker['host'], 0)}", flush=True)
            if free >= estimated_vram_mib:
                load = _jobs_per_worker.get(host, 0)
                # Sort key must avoid comparing dicts: use host as tiebreaker.
                eligible.append(((load, -free, host), worker))
        if not eligible:
            print(f" pick_worker: no candidate for {estimated_vram_mib} MiB", flush=True)
            return None
        # Keys are unique (host is part of the key), so min() picks the same
        # entry a full sort-then-[0] would.
        chosen = min(eligible, key=lambda entry: entry[0])[1]
        chosen_host = chosen["host"]
        _reserved_vram[chosen_host] = _reserved_vram.get(chosen_host, 0) + estimated_vram_mib
        _jobs_per_worker[chosen_host] = _jobs_per_worker.get(chosen_host, 0) + 1
        return chosen
|
||||
|
||||
|
||||
def release_worker(worker: dict, estimated_vram_mib: int):
    """Undo pick_worker's bookkeeping once a job on *worker* finishes.

    Gives back the reserved VRAM estimate and decrements the in-flight job
    count; both counters are clamped at zero so a double release cannot go
    negative.
    """
    with _worker_lock:
        host = worker["host"]
        remaining = _reserved_vram.get(host, 0) - estimated_vram_mib
        _reserved_vram[host] = remaining if remaining > 0 else 0
        _jobs_per_worker[host] = max(0, _jobs_per_worker.get(host, 0) - 1)
|
||||
|
||||
|
||||
def estimate_vram_mib(frame_count: int) -> int:
|
||||
|
||||
Reference in New Issue
Block a user