stage03b: trim videos per run + ours rough cut

LRV proxy (GoPro low-res 768x432 H.264) + ffmpeg -c copy keyframe-aligned.
Inputs: 02_runs.json + 03_video_index.json.
Outputs: per-run mp4 + ours_<gp>.mp4 chrono concat.
Tested on 20260505-Lepradet: 5 files + 2 ours (~11 GB total).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Poulpe
2026-05-16 16:05:41 +00:00
parent 754f3c7272
commit 171f90ce9f

228
pipeline/stages/03b_trim_runs.py Executable file
View File

@@ -0,0 +1,228 @@
#!/usr/bin/env python3
"""Stage 03b - Trim videos per run (LRV proxies + -c copy, fast).
Inputs:
data/<MISSION>/02_runs.json
data/<MISSION>/03_video_index.json
Strategy:
- Use GoPro LRV proxy files (768x432 H.264 ~720 kbps) instead of 4K HEVC originals.
- ffmpeg -c copy per chapter (keyframe-aligned cut) + concat demuxer.
- Output: per-run .mp4 + ours_<gp>.mp4 (concat of per-run).
Falls back to MP4 source if matching LRV is missing.
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import subprocess
import sys
import tempfile
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
# Root directory for trimmed videos; main() creates one sub-directory per mission
# below it and symlinks that sub-directory into the mission's data directory.
OUT_ROOT = Path("/mnt/ssd/cosma-qc-out/03b_trim_runs")
def run_ff(cmd: list[str]) -> None:
    """Run an ffmpeg command line and raise if it exits with a non-zero status.

    Args:
        cmd: Full argv, program name included.

    Raises:
        RuntimeError: The process returned a non-zero exit code. The command
            and the last 3000 characters of its stderr are written to this
            process's stderr before raising.
    """
    # errors="replace": ffmpeg may echo bytes that are not valid in the locale
    # encoding (e.g. odd file names); a strict decode would raise
    # UnicodeDecodeError here and hide the real ffmpeg failure.
    r = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    if r.returncode != 0:
        sys.stderr.write(" ".join(cmd) + "\n")
        sys.stderr.write(r.stderr[-3000:] + "\n")
        raise RuntimeError(f"ffmpeg failed rc={r.returncode}")
def lrv_for_chapter(mp4_path: Path) -> Path | None:
    """Return the GoPro low-res proxy (.LRV) sitting next to a chapter file.

    GoPro names chapters ``GX<nn><nnnn>.MP4`` (HEVC) or ``GH<nn><nnnn>.MP4``
    (H.264); in both cases the proxy is ``GL<nn><nnnn>.LRV`` in the same
    directory.

    Args:
        mp4_path: Path of the original chapter file (it does not need to exist).

    Returns:
        The proxy path if such a file exists, otherwise ``None`` (also when the
        name does not follow the GoPro chapter naming scheme).
    """
    name = mp4_path.name
    upper = name.upper()
    if not upper.startswith(("GX", "GH")) or not upper.endswith(".MP4"):
        return None
    stem = "GL" + name[2:-4]
    # Cameras write ".LRV"; also accept a lower-cased extension in case the
    # files were renamed while being copied to a case-sensitive filesystem.
    for ext in (".LRV", ".lrv"):
        candidate = mp4_path.parent / (stem + ext)
        if candidate.is_file():
            return candidate
    return None
def overlap_clips(
    run_start: float,
    run_end: float,
    chapters: list[dict],
    min_overlap_s: float = 1.0,
) -> list[tuple[dict, float, float]]:
    """Return the portions of ``chapters`` that fall inside a run.

    Args:
        run_start: Run start, in the same time base as the chapters' epochs.
        run_end: Run end, same time base.
        chapters: Dicts with ``start_epoch`` and ``end_epoch`` keys.
        min_overlap_s: Overlaps of this length or shorter are dropped
            (default 1.0, the previously hard-coded threshold).

    Returns:
        ``[(chapter, start_off_s, duration_s)]`` ordered by chapter start,
        where ``start_off_s`` is the offset of the cut inside the chapter.
    """
    out: list[tuple[dict, float, float]] = []
    for ch in sorted(chapters, key=lambda c: c["start_epoch"]):
        a = max(run_start, ch["start_epoch"])
        b = min(run_end, ch["end_epoch"])
        dur = b - a
        # Also rejects non-overlapping chapters, for which dur is negative.
        if dur <= min_overlap_s:
            continue
        start_off = max(0.0, run_start - ch["start_epoch"])
        out.append((ch, start_off, dur))
    return out
def cut_clip(src: Path, start_off: float, duration: float, dst: Path) -> None:
    """Stream-copy ``duration`` seconds of ``src``, starting at ``start_off``, into ``dst``.

    No re-encoding takes place (``-c copy``), so the cut is keyframe-aligned
    rather than frame-accurate. Audio is dropped (``-an``).
    """
    base = ["ffmpeg", "-y", "-loglevel", "error"]
    # -ss placed before -i makes ffmpeg seek in the input.
    seek_and_input = ["-ss", f"{start_off:.3f}", "-i", str(src)]
    output_opts = [
        "-t", f"{duration:.3f}",
        "-c", "copy",
        "-avoid_negative_ts", "make_zero",
        "-an",
    ]
    run_ff([*base, *seek_and_input, *output_opts, str(dst)])
def concat_demux(parts: list[Path], dst: Path) -> None:
    """Join ``parts`` into ``dst`` with the ffmpeg concat demuxer (``-c copy``).

    Args:
        parts: Segment files in playback order. An empty list is a no-op and
            a single part is simply copied to ``dst``.
        dst: Output file.

    Raises:
        RuntimeError: ffmpeg exited with a non-zero status (from ``run_ff``).
    """
    if not parts:
        return
    if len(parts) == 1:
        shutil.copy2(parts[0], dst)
        return
    listfile = None
    try:
        # The list file is created inside the try block so that it is removed
        # even when writing it fails.
        with tempfile.NamedTemporaryFile(
            "w", suffix=".txt", delete=False, encoding="utf-8"
        ) as f:
            listfile = f.name
            for p in parts:
                # Inside single quotes the concat list syntax has no escapes:
                # a literal quote must close the string, add \' and reopen it.
                quoted = str(p.resolve()).replace("'", "'\\''")
                f.write(f"file '{quoted}'\n")
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-f", "concat", "-safe", "0",
            "-i", listfile,
            "-c", "copy",
            "-movflags", "+faststart",
            "-an",
            str(dst),
        ]
        run_ff(cmd)
    finally:
        if listfile is not None:
            os.unlink(listfile)
def _relink_output_dir(link: Path, target: Path) -> None:
    """Make ``link`` a symlink to ``target``, replacing whatever is at that path."""
    if link.is_symlink() or link.is_file():
        # rmtree() refuses symlinks and raises on regular files.
        link.unlink()
    elif link.exists():
        shutil.rmtree(link)
    link.symlink_to(target)


def _pick_sources(
    clips: list[tuple[dict, float, float]], prefer_source: str, label: str
) -> tuple[list[Path], str]:
    """Choose the file to cut for every chapter of one output; return (paths, tag).

    All chapters of an output use the same kind of source: the parts are joined
    with ``-c copy``, which cannot mix LRV proxies with the MP4 originals.
    """
    originals = [Path(ch["filepath"]) for ch, _, _ in clips]
    if prefer_source != "lrv":
        return originals, "mp4"
    proxies = [lrv_for_chapter(p) for p in originals]
    if all(p is not None for p in proxies):
        return [p for p in proxies if p is not None], "lrv"
    if any(p is not None for p in proxies):
        sys.stderr.write(
            f"[warn] {label}: LRV proxy missing for some chapters, "
            "using MP4 for all of them\n"
        )
    return originals, "mp4"


def _concat_atomic(parts: list[Path], dst: Path, tmp_dir: Path) -> None:
    """Concat ``parts`` into a staging file in ``tmp_dir``, then move it onto ``dst``.

    ``dst`` therefore only ever appears complete, so a failed ffmpeg run cannot
    leave a truncated file that a later ``--skip-existing`` run would reuse.
    """
    staged = tmp_dir / dst.name
    try:
        concat_demux(parts, staged)
        os.replace(staged, dst)
    finally:
        staged.unlink(missing_ok=True)


def _render_output(
    clips: list[tuple[dict, float, float]],
    sources: list[Path],
    stem: str,
    tmp_dir: Path,
    out_path: Path,
) -> None:
    """Cut every clip from its source and join the parts into ``out_path``."""
    tmp_parts: list[Path] = []
    try:
        for i, (src, (_, soff, dur)) in enumerate(zip(sources, clips)):
            tp = tmp_dir / f"{stem}_p{i:02d}.mp4"
            # Registered before cutting so a half-written part is removed too.
            tmp_parts.append(tp)
            cut_clip(src, soff, dur, tp)
        _concat_atomic(tmp_parts, out_path, tmp_dir)
    finally:
        for p in tmp_parts:
            p.unlink(missing_ok=True)


def main() -> None:
    """Trim the indexed videos to each run and build one ``ours_<gp>.mp4`` per camera.

    Reads ``02_runs.json`` and ``03_video_index.json`` from the mission's data
    directory, writes the videos below ``OUT_ROOT/<mission>``, symlinks that
    directory as ``<data dir>/03b_trim_runs`` and writes the manifest
    ``03b_trim_runs.json`` next to the inputs.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--mission", required=True)
    ap.add_argument("--data-root", default="/home/cosma/cosma-qc/data")
    ap.add_argument("--skip-existing", action="store_true")
    ap.add_argument("--prefer-source", choices=["lrv", "mp4"], default="lrv",
                    help="lrv = use .LRV proxy (default, fast); mp4 = use 4K originals")
    args = ap.parse_args()

    mission = args.mission
    data_dir = Path(args.data_root) / mission
    runs = json.loads((data_dir / "02_runs.json").read_text())["runs"]
    vidx = json.loads((data_dir / "03_video_index.json").read_text())
    videos = vidx["videos"]

    # Chapters grouped per (vehicle, camera).
    by_auv_gp: dict[tuple[str, str], list[dict]] = defaultdict(list)
    for v in videos:
        by_auv_gp[(v["auv"], v["gp"])].append(v)
    all_gps = sorted({v["gp"] for v in videos})

    out_dir = OUT_ROOT / mission
    out_dir.mkdir(parents=True, exist_ok=True)
    tmp_dir = out_dir / "_tmp"
    tmp_dir.mkdir(exist_ok=True)
    _relink_output_dir(data_dir / "03b_trim_runs", out_dir)

    manifest = {
        "mission": mission,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "output_root": str(out_dir),
        "source": args.prefer_source,
        "runs": [],
        "ours": {},
    }

    runs_by_chrono = sorted(runs, key=lambda r: r["start_epoch"])
    ours_parts: dict[str, list[tuple[float, Path, str]]] = defaultdict(list)
    # Source kinds actually used for the outputs generated in this invocation.
    ours_sources: dict[str, set[str]] = defaultdict(set)

    for run in runs_by_chrono:
        run_id = run["run_id"]
        auv = run["auv"]
        r_start = run["start_epoch"]
        r_end = run["end_epoch"]
        run_entry = {"run_id": run_id, "auv": auv, "duration_s": run["duration_s"], "outputs": []}
        for gp in all_gps:
            chapters = by_auv_gp.get((auv, gp), [])
            if not chapters:
                continue
            clips = overlap_clips(r_start, r_end, chapters)
            if not clips:
                continue
            out_name = f"{run_id}_{auv}_{gp}.mp4"
            out_path = out_dir / out_name
            if args.skip_existing and out_path.exists() and out_path.stat().st_size > 0:
                print(f"[skip] {out_name}", flush=True)
            else:
                sources, tag = _pick_sources(clips, args.prefer_source, out_name)
                print(
                    f"[cut ] {out_name} chapters={len(sources)} src={','.join([tag] * len(sources))}",
                    flush=True,
                )
                _render_output(clips, sources, f"{run_id}_{auv}_{gp}", tmp_dir, out_path)
                ours_sources[gp].add(tag)
            sz_mb = round(out_path.stat().st_size / 1024 / 1024, 1)
            run_entry["outputs"].append({"gp": gp, "file": out_name, "size_mb": sz_mb})
            ours_parts[gp].append((r_start, out_path, f"{run_id} {auv}"))
        manifest["runs"].append(run_entry)

    for gp, parts in ours_parts.items():
        parts.sort()
        ordered_paths = [p for _, p, _ in parts]
        ours_path = out_dir / f"ours_{gp}.mp4"
        if len(ours_sources.get(gp, ())) > 1:
            sys.stderr.write(
                f"[warn] {ours_path.name}: per-run clips come from both LRV and MP4 "
                "sources; the stream-copied concat may not play correctly\n"
            )
        print(f"[ours] {ours_path.name} <- {len(ordered_paths)} clip(s)", flush=True)
        _concat_atomic(ordered_paths, ours_path, tmp_dir)
        sz_mb = round(ours_path.stat().st_size / 1024 / 1024, 1)
        manifest["ours"][gp] = {
            "file": ours_path.name,
            "size_mb": sz_mb,
            "segments": [lbl for _, _, lbl in parts],
        }

    (data_dir / "03b_trim_runs.json").write_text(json.dumps(manifest, indent=2))
    print(f"\n[done] manifest: {data_dir / '03b_trim_runs.json'}", flush=True)
    print(f"[done] outputs: {out_dir}", flush=True)
    # Best effort: the scratch directory is only removed when it is empty.
    try:
        tmp_dir.rmdir()
    except OSError:
        pass


if __name__ == "__main__":
    main()