stage01: ajoute collectors MAG (cosmag csv ar/av/side) + SSS (Kogger klf/bin)
Manifest 01 enrichi avec deux nouveaux champs:
- sss_files: {klf,bin} relatifs sous raw_data/sss/**
- mag_files: {ar,av,side,other} csv sous raw_data/mag/**
Tests:
- 20260505-Lepradet: sss=1 mag=0 (mag vide attendu)
- 20260508-sttropez: sss=53 mag=73 (match scout)
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
304
pipeline/stages/01_select_mission.py
Executable file
304
pipeline/stages/01_select_mission.py
Executable file
@@ -0,0 +1,304 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Stage 01 — Select mission and produce a raw manifest.
|
||||
|
||||
Scans `<ssd_base>/<mission>/raw_data/` and writes
|
||||
`<out>/<mission>/01_manifest.json` listing all AUVs, GoPro folders, MCAP bags,
|
||||
BIN files, USBL logs.
|
||||
|
||||
Usage:
|
||||
python3 01_select_mission.py \
|
||||
--mission 20260505-Lepradet \
|
||||
--ssd-base /mnt/ssd \
|
||||
--out /home/cosma/cosma-qc/data
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# Local helper for stage time/memory tracking
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from _meta import track_stage # noqa: E402
|
||||
|
||||
# AUV physical (GoPro) and AUV MCAP IDs — pattern matching only, never write to SSD.
|
||||
AUV_PHYS_RE = re.compile(r"AUV(\d{3})", re.I)
|
||||
GP_FOLDER_RE = re.compile(r"^GP(?P<gp>\d+)[_-]AUV(?P<auv>\d{3})$", re.I)
|
||||
BAG_AUV_RE = re.compile(r"_AUV(?P<auv>\d{3})(?:[_/]|$)", re.I)
|
||||
USBL_FILE_RE = re.compile(r"usbl", re.I)
|
||||
|
||||
|
||||
def iso_utc_now() -> str:
|
||||
return datetime.now(timezone.utc).isoformat(timespec="seconds")
|
||||
|
||||
|
||||
def safe_listdir(p: Path) -> list[Path]:
|
||||
if not p.exists() or not p.is_dir():
|
||||
return []
|
||||
try:
|
||||
return sorted(p.iterdir())
|
||||
except OSError:
|
||||
return []
|
||||
|
||||
|
||||
def collect_videos(ssd_path: Path) -> tuple[dict[str, dict[str, list[str]]], int, float]:
|
||||
"""videos[AUV][GP1|GP2] -> sorted list of MP4 filenames. Two layouts tried."""
|
||||
videos: dict[str, dict[str, list[str]]] = defaultdict(lambda: defaultdict(list))
|
||||
total_n = 0
|
||||
total_bytes = 0
|
||||
|
||||
# Layout A: medias/videos/GP{n}{_|-}AUV{xxx}/*.MP4
|
||||
vroot_a = ssd_path / "medias" / "videos"
|
||||
for sub in safe_listdir(vroot_a):
|
||||
if not sub.is_dir():
|
||||
continue
|
||||
m = GP_FOLDER_RE.match(sub.name)
|
||||
if not m:
|
||||
continue
|
||||
gp_key = f"GP{int(m.group('gp'))}"
|
||||
auv_id = f"AUV{int(m.group('auv')):03d}"
|
||||
for f in safe_listdir(sub):
|
||||
if f.is_file() and f.suffix.upper() == ".MP4":
|
||||
videos[auv_id][gp_key].append(f.name)
|
||||
total_n += 1
|
||||
try:
|
||||
total_bytes += f.stat().st_size
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# Layout B: medias/videos/AUV{xxx}/GP{n}/*.MP4 (fallback if anything found)
|
||||
for sub in safe_listdir(vroot_a):
|
||||
if not sub.is_dir():
|
||||
continue
|
||||
am = AUV_PHYS_RE.fullmatch(sub.name)
|
||||
if not am:
|
||||
continue
|
||||
auv_id = f"AUV{int(am.group(1)):03d}"
|
||||
for gp_sub in safe_listdir(sub):
|
||||
if not gp_sub.is_dir():
|
||||
continue
|
||||
gm = re.fullmatch(r"GP(\d+)", gp_sub.name, re.I)
|
||||
if not gm:
|
||||
continue
|
||||
gp_key = f"GP{int(gm.group(1))}"
|
||||
for f in safe_listdir(gp_sub):
|
||||
if f.is_file() and f.suffix.upper() == ".MP4":
|
||||
videos[auv_id][gp_key].append(f.name)
|
||||
total_n += 1
|
||||
try:
|
||||
total_bytes += f.stat().st_size
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# Sort + dedup
|
||||
out: dict[str, dict[str, list[str]]] = {}
|
||||
for auv in sorted(videos.keys()):
|
||||
out[auv] = {}
|
||||
for gp in sorted(videos[auv].keys()):
|
||||
out[auv][gp] = sorted(set(videos[auv][gp]))
|
||||
return out, total_n, total_bytes / 1e9
|
||||
|
||||
|
||||
def collect_mcap_bags(ssd_path: Path) -> tuple[dict[str, list[str]], int]:
|
||||
"""mcap_bags[AUV{nnn}] -> sorted list of bag-dir names."""
|
||||
bags: dict[str, list[str]] = defaultdict(list)
|
||||
n = 0
|
||||
broot = ssd_path / "logs" / "SUB" / "bag"
|
||||
for sub in safe_listdir(broot):
|
||||
if not sub.is_dir():
|
||||
continue
|
||||
m = BAG_AUV_RE.search(sub.name)
|
||||
if not m:
|
||||
continue
|
||||
auv_id = f"AUV{int(m.group('auv')):03d}"
|
||||
bags[auv_id].append(sub.name)
|
||||
n += 1
|
||||
return {k: sorted(set(v)) for k, v in sorted(bags.items())}, n
|
||||
|
||||
|
||||
def collect_bin_files(ssd_path: Path) -> tuple[dict[str, list[str]], int]:
|
||||
"""bin_files[AUV{nnn}] -> sorted .BIN filenames."""
|
||||
bins: dict[str, list[str]] = defaultdict(list)
|
||||
n = 0
|
||||
# Layout: logs/SUB/AUV{xxx}/*.BIN
|
||||
sub_root = ssd_path / "logs" / "SUB"
|
||||
for sub in safe_listdir(sub_root):
|
||||
if not sub.is_dir():
|
||||
continue
|
||||
am = AUV_PHYS_RE.fullmatch(sub.name)
|
||||
if not am:
|
||||
continue
|
||||
auv_id = f"AUV{int(am.group(1)):03d}"
|
||||
for f in safe_listdir(sub):
|
||||
if f.is_file() and f.suffix.upper() == ".BIN":
|
||||
bins[auv_id].append(f.name)
|
||||
n += 1
|
||||
# Also try logs/SUB/bin/*.BIN (flat layout)
|
||||
flat = ssd_path / "logs" / "SUB" / "bin"
|
||||
for f in safe_listdir(flat):
|
||||
if f.is_file() and f.suffix.upper() == ".BIN":
|
||||
bins.setdefault("AUV000", []).append(f.name)
|
||||
n += 1
|
||||
return {k: sorted(set(v)) for k, v in sorted(bins.items())}, n
|
||||
|
||||
|
||||
def collect_usbl_logs(ssd_path: Path) -> list[str]:
|
||||
"""Return list of USBL csv paths relative to ssd_path."""
|
||||
out = []
|
||||
logs_root = ssd_path / "logs"
|
||||
if not logs_root.exists():
|
||||
return out
|
||||
for p in logs_root.rglob("*"):
|
||||
try:
|
||||
if not p.is_file():
|
||||
continue
|
||||
except OSError:
|
||||
continue
|
||||
name_l = p.name.lower()
|
||||
if "usbl" in name_l and name_l.endswith(".csv"):
|
||||
try:
|
||||
out.append(str(p.relative_to(ssd_path)))
|
||||
except ValueError:
|
||||
out.append(str(p))
|
||||
return sorted(set(out))
|
||||
|
||||
|
||||
def collect_audio_logs(ssd_path: Path) -> list[str]:
|
||||
out = []
|
||||
aud = ssd_path / "audios"
|
||||
for p in aud.rglob("*") if aud.exists() else []:
|
||||
try:
|
||||
if p.is_file():
|
||||
out.append(str(p.relative_to(ssd_path)))
|
||||
except (OSError, ValueError):
|
||||
continue
|
||||
return sorted(out)
|
||||
|
||||
|
||||
|
||||
def collect_sss_files(ssd_path: Path) -> tuple[dict[str, list[str]], int]:
|
||||
"""klf + bin under ssd_path/sss/, recursive."""
|
||||
sss_root = ssd_path / "sss"
|
||||
klf: list[str] = []
|
||||
bin_: list[str] = []
|
||||
if sss_root.exists():
|
||||
try:
|
||||
for p in sss_root.rglob("*.klf"):
|
||||
try:
|
||||
if p.is_file():
|
||||
klf.append(str(p.relative_to(ssd_path)))
|
||||
except (OSError, ValueError):
|
||||
continue
|
||||
for p in sss_root.rglob("*.bin"):
|
||||
try:
|
||||
if p.is_file():
|
||||
bin_.append(str(p.relative_to(ssd_path)))
|
||||
except (OSError, ValueError):
|
||||
continue
|
||||
except OSError:
|
||||
pass
|
||||
result = {"klf": sorted(set(klf)), "bin": sorted(set(bin_))}
|
||||
return result, len(result["klf"]) + len(result["bin"])
|
||||
|
||||
|
||||
def collect_mag_files(ssd_path: Path) -> tuple[dict[str, list[str]], int]:
|
||||
"""csv under ssd_path/mag/, categorised ar/av/side/other."""
|
||||
mag_root = ssd_path / "mag"
|
||||
out: dict[str, list[str]] = {"ar": [], "av": [], "side": [], "other": []}
|
||||
if mag_root.exists():
|
||||
try:
|
||||
for p in mag_root.rglob("*.csv"):
|
||||
try:
|
||||
if not p.is_file():
|
||||
continue
|
||||
rel = str(p.relative_to(ssd_path))
|
||||
parts = p.parts
|
||||
if "ar" in parts:
|
||||
out["ar"].append(rel)
|
||||
elif "av" in parts:
|
||||
out["av"].append(rel)
|
||||
elif "side" in parts:
|
||||
out["side"].append(rel)
|
||||
else:
|
||||
out["other"].append(rel)
|
||||
except (OSError, ValueError):
|
||||
continue
|
||||
except OSError:
|
||||
pass
|
||||
result = {k: sorted(set(v)) for k, v in out.items()}
|
||||
total = sum(len(v) for v in result.values())
|
||||
return result, total
|
||||
|
||||
def build_manifest(mission: str, ssd_base: Path) -> dict:
|
||||
ssd_path = ssd_base / mission / "raw_data"
|
||||
if not ssd_path.exists():
|
||||
raise FileNotFoundError(f"raw_data not found: {ssd_path}")
|
||||
|
||||
videos, n_vids, total_gb = collect_videos(ssd_path)
|
||||
mcap_bags, n_bags = collect_mcap_bags(ssd_path)
|
||||
bin_files, n_bins = collect_bin_files(ssd_path)
|
||||
usbl_logs = collect_usbl_logs(ssd_path)
|
||||
audio_logs = collect_audio_logs(ssd_path)
|
||||
sss_files, n_sss = collect_sss_files(ssd_path)
|
||||
mag_files, n_mag = collect_mag_files(ssd_path)
|
||||
|
||||
return {
|
||||
"mission": mission,
|
||||
"generated_at": iso_utc_now(),
|
||||
"ssd_path": str(ssd_path),
|
||||
"videos": videos,
|
||||
"mcap_bags": mcap_bags,
|
||||
"bin_files": bin_files,
|
||||
"usbl_logs": usbl_logs,
|
||||
"audio_logs": audio_logs,
|
||||
"sss_files": sss_files,
|
||||
"mag_files": mag_files,
|
||||
"totals": {
|
||||
"n_videos": n_vids,
|
||||
"n_mcap_bags": n_bags,
|
||||
"n_bin_files": n_bins,
|
||||
"n_usbl_logs": len(usbl_logs),
|
||||
"n_audio_logs": len(audio_logs),
|
||||
"n_sss_files": n_sss,
|
||||
"n_mag_files": n_mag,
|
||||
"total_video_size_gb": round(total_gb, 2),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
|
||||
ap = argparse.ArgumentParser(description="Stage 01 — select mission, produce raw manifest.")
|
||||
ap.add_argument("--mission", required=True, help="Mission folder name (e.g. 20260505-Lepradet)")
|
||||
ap.add_argument("--ssd-base", default="/mnt/ssd", help="SSD base path (default /mnt/ssd)")
|
||||
ap.add_argument("--out", default="/home/cosma/cosma-qc/data",
|
||||
help="Output base dir (manifest written to <out>/<mission>/01_manifest.json)")
|
||||
args = ap.parse_args(argv)
|
||||
|
||||
ssd_base = Path(args.ssd_base)
|
||||
out_base = Path(args.out)
|
||||
out_dir = out_base / args.mission
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
out_path = out_dir / "01_manifest.json"
|
||||
|
||||
with track_stage("01_select_mission", args.mission, out_dir,
|
||||
output_files=[out_path]):
|
||||
manifest = build_manifest(args.mission, ssd_base)
|
||||
tmp = out_path.with_suffix(".json.tmp")
|
||||
tmp.write_text(json.dumps(manifest, indent=2, ensure_ascii=False))
|
||||
tmp.replace(out_path)
|
||||
|
||||
t = manifest["totals"]
|
||||
print(f"[ok] {out_path}")
|
||||
print(f" videos={t['n_videos']} ({t['total_video_size_gb']} GB) "
|
||||
f"bags={t['n_mcap_bags']} bins={t['n_bin_files']} "
|
||||
f"usbl={t['n_usbl_logs']} audio={t['n_audio_logs']} "
|
||||
f"sss={t['n_sss_files']} mag={t['n_mag_files']}")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user