ingest+dispatcher — support acquisition depuis remote host via SSH

- ingest.py : --remote-host <alias> pour exécuter le scan et exiftool via SSH ;
  les chemins sont stockés avec le préfixe "alias:" pour que le worker puisse
  récupérer les fichiers directement depuis l'hôte source
- dispatcher.py : scp_to_worker détecte "host:path" et fait pull remote
  (worker → source host) au lieu du double hop via dispatcher
- _path_basename gère les paths préfixés pour ffmpeg

Permet d'ingérer les vidéos depuis n'importe quelle machine accessible
en SSH sans faire transiter 145 GB par le conteneur FastAPI.
This commit is contained in:
Poulpe
2026-04-21 13:31:40 +00:00
parent 468f9084ec
commit 192550b60b
2 changed files with 64 additions and 16 deletions

View File

@@ -27,13 +27,23 @@ from pathlib import Path
DB_PATH = Path(os.environ.get("COSMA_QC_DB", "/var/lib/cosma-qc/jobs.db"))
FOLDER_RE = re.compile(r"GP(?P<gopro>\d+)_AUV(?P<auv>\d+)", re.I)
REMOTE_HOST: str | None = None # set via --remote-host
def _run_cmd(args: list[str], timeout: int = 10) -> str:
    """Execute *args* and return its standard output, stripped.

    When the module-level ``REMOTE_HOST`` is set, the command is not run
    locally: every argument is shell-quoted, the pieces are joined into a
    single command string, and that string is handed to
    ``ssh -o BatchMode=yes REMOTE_HOST``. Otherwise *args* is executed
    as-is on the local machine.

    Args:
        args: Program and arguments, one list item per argument.
        timeout: Seconds to wait before ``subprocess`` aborts the call.

    Returns:
        The captured stdout with leading/trailing whitespace removed.
        stderr is discarded.

    Raises:
        subprocess.CalledProcessError: The command exited non-zero.
        subprocess.TimeoutExpired: The command exceeded *timeout*.
    """
    command = args
    if REMOTE_HOST:
        import shlex as _shlex

        # The remote side re-parses the string with a shell, so each
        # argument is quoted individually before joining.
        quoted = " ".join(map(_shlex.quote, args))
        command = ["ssh", "-o", "BatchMode=yes", REMOTE_HOST, quoted]
    raw = subprocess.check_output(
        command, stderr=subprocess.DEVNULL, text=True, timeout=timeout
    )
    return raw.strip()
def exif_create_date(path: Path) -> datetime | None:
try:
out = subprocess.check_output(
out = _run_cmd(
["exiftool", "-s3", "-CreateDate", "-api", "QuickTimeUTC=1", str(path)],
stderr=subprocess.DEVNULL, text=True, timeout=10,
).strip()
)
if not out:
return None
# Strip timezone suffix (+HH:MM or -HH:MM) if present
@@ -46,10 +56,7 @@ def exif_create_date(path: Path) -> datetime | None:
def exif_duration_s(path: Path) -> float | None:
try:
out = subprocess.check_output(
["exiftool", "-s3", "-Duration#", str(path)],
stderr=subprocess.DEVNULL, text=True, timeout=10,
).strip()
out = _run_cmd(["exiftool", "-s3", "-Duration#", str(path)])
return float(out) if out else None
except Exception:
return None
@@ -57,10 +64,9 @@ def exif_duration_s(path: Path) -> float | None:
def exif_serial(path: Path) -> str | None:
try:
out = subprocess.check_output(
out = _run_cmd(
["exiftool", "-s3", "-SerialNumber", "-CameraSerialNumber", str(path)],
stderr=subprocess.DEVNULL, text=True, timeout=10,
).strip().splitlines()
).splitlines()
for line in out:
line = line.strip()
if line:
@@ -88,18 +94,31 @@ def group_segments(videos: list[dict], gap_min: int) -> list[dict]:
for seg in segments:
start = seg[0]["start"]
end = seg[-1]["start"] + timedelta(seconds=seg[-1]["duration"] or 0)
prefix = f"{REMOTE_HOST}:" if REMOTE_HOST else ""
out.append({
"start": start, "end": end,
"label": f"{start.strftime('%H:%M')}{end.strftime('%H:%M')}",
"videos": [str(v["path"]) for v in seg],
"videos": [prefix + str(v["path"]) for v in seg],
})
return out
def _list_mp4s(root: Path) -> list[Path]:
    """Return the MP4 paths found under *root*.

    With no ``REMOTE_HOST`` configured, the tree is walked locally with
    ``root.rglob("*.MP4")``. With ``REMOTE_HOST`` set, ``find`` is run on
    that host over SSH (60 s timeout) and each non-blank output line is
    turned into a ``Path``.

    NOTE(review): the remote branch matches case-insensitively
    (``-iname``) and regular files only (``-type f``), while the local
    branch relies on ``rglob`` with no file-type filter — confirm this
    difference is intended.

    Args:
        root: Directory to search (a path on the remote host when
            ``REMOTE_HOST`` is set).

    Returns:
        The matching paths, in the order the underlying search produced
        them.

    Raises:
        subprocess.CalledProcessError: The remote ``ssh``/``find`` call
            exited non-zero.
        subprocess.TimeoutExpired: The remote call exceeded 60 seconds.
    """
    if not REMOTE_HOST:
        return list(root.rglob("*.MP4"))

    import shlex as _shlex

    # Only the root is quoted dynamically; the glob is already protected
    # by the literal single quotes in the command string.
    find_cmd = f"find {_shlex.quote(str(root))} -type f -iname '*.MP4'"
    listing = subprocess.check_output(
        ["ssh", "-o", "BatchMode=yes", REMOTE_HOST, find_cmd],
        text=True, timeout=60,
    )
    stripped = (line.strip() for line in listing.splitlines())
    return [Path(entry) for entry in stripped if entry]
def scan(root: Path) -> dict:
"""Return {(auv, gopro_tag): {serial, videos[]}}"""
grouped: dict[tuple[str, str], dict] = {}
for mp4 in root.rglob("*.MP4"):
for mp4 in _list_mp4s(root):
m = FOLDER_RE.search(str(mp4.parent))
if not m:
continue
@@ -124,12 +143,17 @@ def main():
ap.add_argument("--name", required=True, help="Acquisition name")
ap.add_argument("--gap-min", type=int, default=5, help="Max gap between videos in one segment")
ap.add_argument("--dry-run", action="store_true")
ap.add_argument("--remote-host", default=None,
help="SSH alias to read videos/exiftool from (stored paths get 'alias:' prefix)")
args = ap.parse_args()
if not args.root.exists():
global REMOTE_HOST
REMOTE_HOST = args.remote_host
if not REMOTE_HOST and not args.root.exists():
raise SystemExit(f"root not found: {args.root}")
print(f"Scanning {args.root}...")
print(f"Scanning {args.root}{' @ ' + REMOTE_HOST if REMOTE_HOST else ''}...")
grouped = scan(args.root)
if not grouped:
print("No (auv, gopro) folders found — expected GPx_AUVyyy layout."); return