cosma-qc/scripts/ingest.py

#!/usr/bin/env python3
"""Scan an acquisition directory, group GoPro MP4s into continuous segments,
and insert jobs into the cosma-qc DB.

Usage:
    python3 ingest.py /mnt/portablessd/COSMA-<date>/ --name "La Ciotat 8 avril" [--gap-min 5]

Directory layout expected (we saw this from the real SSD):

    <root>/media/gopro{1,2}/GP{1,2}_AUV{209,210}/GX*.MP4

The AUV tag and GoPro id come from folder names. The serial is read via
exiftool (falls back to folder name if unavailable). Continuous segments are
derived from EXIF CreateDate timestamps with a configurable gap threshold.
"""
from __future__ import annotations

import argparse
import json
import os
import re
import sqlite3
import subprocess
from datetime import datetime, timedelta
from pathlib import Path

DB_PATH = Path(os.environ.get("COSMA_QC_DB", "/var/lib/cosma-qc/jobs.db"))
FOLDER_RE = re.compile(r"GP(?P<gopro>\d+)_AUV(?P<auv>\d+)", re.I)


def exif_create_date(path: Path) -> datetime | None:
    try:
        out = subprocess.check_output(
            ["exiftool", "-s3", "-CreateDate", "-api", "QuickTimeUTC=1", str(path)],
            stderr=subprocess.DEVNULL, text=True, timeout=10,
        ).strip()
        if not out:
            return None
        # Strip timezone suffix (+HH:MM or -HH:MM) if present
        import re as _re
        out = _re.sub(r'[+-]\d{2}:\d{2}$', '', out).strip()
        return datetime.strptime(out, "%Y:%m:%d %H:%M:%S")
    except Exception:
        return None


def exif_duration_s(path: Path) -> float | None:
    try:
        out = subprocess.check_output(
            ["exiftool", "-s3", "-Duration#", str(path)],
            stderr=subprocess.DEVNULL, text=True, timeout=10,
        ).strip()
        return float(out) if out else None
    except Exception:
        return None


def exif_serial(path: Path) -> str | None:
    try:
        out = subprocess.check_output(
            ["exiftool", "-s3", "-SerialNumber", "-CameraSerialNumber", str(path)],
            stderr=subprocess.DEVNULL, text=True, timeout=10,
        ).strip().splitlines()
        for line in out:
            line = line.strip()
            if line:
                return line
    except Exception:
        pass
    return None


def group_segments(videos: list[dict], gap_min: int) -> list[dict]:
    """Group consecutive videos into segments when gap between end-of-A and
    start-of-B is below `gap_min` minutes."""
    videos = sorted(videos, key=lambda v: v["start"])
    segments: list[list[dict]] = []
    for v in videos:
        if not segments:
            segments.append([v]); continue
        last = segments[-1][-1]
        last_end = last["start"] + timedelta(seconds=last["duration"] or 0)
        if (v["start"] - last_end) <= timedelta(minutes=gap_min):
            segments[-1].append(v)
        else:
            segments.append([v])
    out = []
    for seg in segments:
        start = seg[0]["start"]
        end = seg[-1]["start"] + timedelta(seconds=seg[-1]["duration"] or 0)
        out.append({
            "start": start, "end": end,
            "label": f"{start.strftime('%H:%M')}–{end.strftime('%H:%M')}",
            "videos": [str(v["path"]) for v in seg],
        })
    return out


def scan(root: Path) -> dict:
    """Return {(auv, gopro_tag): {serial, videos[]}}"""
    grouped: dict[tuple[str, str], dict] = {}
    for mp4 in root.rglob("*.MP4"):
        m = FOLDER_RE.search(str(mp4.parent))
        if not m:
            continue
        auv = f"AUV{m.group('auv')}"
        gopro_tag = f"GP{m.group('gopro')}"
        key = (auv, gopro_tag)
        start = exif_create_date(mp4)
        dur = exif_duration_s(mp4)
        if not start:
            print(f"  [skip] no CreateDate: {mp4}"); continue
        serial = exif_serial(mp4)
        slot = grouped.setdefault(key, {"serial": serial, "videos": []})
        if serial and not slot["serial"]:
            slot["serial"] = serial
        slot["videos"].append({"path": mp4, "start": start, "duration": dur or 0})
    return grouped


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("root", type=Path)
    ap.add_argument("--name", required=True, help="Acquisition name")
    ap.add_argument("--gap-min", type=int, default=5, help="Max gap between videos in one segment")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    if not args.root.exists():
        raise SystemExit(f"root not found: {args.root}")

    print(f"Scanning {args.root}...")
    grouped = scan(args.root)
    if not grouped:
        print("No (auv, gopro) folders found — expected GPx_AUVyyy layout."); return

    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH, isolation_level=None)
    conn.execute("PRAGMA foreign_keys=ON")
    conn.row_factory = sqlite3.Row

    if args.dry_run:
        acq_id = -1
    else:
        cur = conn.execute(
            "INSERT INTO acquisitions (name, source_path) VALUES (?, ?)",
            (args.name, str(args.root)),
        )
        acq_id = cur.lastrowid
        print(f"Created acquisition id={acq_id}")

    total_jobs = 0
    for (auv, gopro_tag), info in sorted(grouped.items()):
        serial = info["serial"] or gopro_tag
        segs = group_segments(info["videos"], args.gap_min)
        print(f"\n{auv} / {gopro_tag} (serial={serial}) — {len(info['videos'])} videos → {len(segs)} segments")
        for seg in segs:
            dur_min = (seg["end"] - seg["start"]).total_seconds() / 60
            print(f"  · {seg['label']}  ({dur_min:.1f} min, {len(seg['videos'])} files)")
            if args.dry_run:
                continue
            conn.execute("""
                INSERT INTO jobs (acquisition_id, auv, gopro_serial, segment_label,
                                  video_paths, status)
                VALUES (?, ?, ?, ?, ?, 'queued')
            """, (acq_id, auv, serial, seg["label"], json.dumps(seg["videos"])))
            total_jobs += 1

    print(f"\nInserted {total_jobs} jobs.")


if __name__ == "__main__":
    main()