import os, re, json, h5py from pathlib import Path from datetime import datetime, timedelta from tqdm import tqdm DATA_ROOTS = [Path("/mnt/kingston"), Path("/mnt/data_sdb1")] POSITIONS_CSV = Path("/mnt/kingston/Copie de SETE_AUV_DARFV4-Copier(1).csv") OUTPUT_INDEX = Path("/mnt/kingston/seismic_webapp/data/index.json") SAMPLE_RATE = 200 def load_pos(): positions = {} if not POSITIONS_CSV.exists(): return {} with open(POSITIONS_CSV, 'r', encoding='utf-8', errors='replace') as f: lines = f.readlines() if len(lines) < 5: return {} headers = lines[3].strip().split(',') try: ni = headers.index('NodeCode') ei = headers.index('Aslaid Easting') if 'Aslaid Easting' in headers else headers.index('Preplot Easting') oi = headers.index('Aslaid Northing') if 'Aslaid Northing' in headers else headers.index('Preplot Northing') except: return {} for line in lines[4:]: parts = line.strip().split(',') try: nid = parts[ni].strip() positions[nid] = { 'easting': float(parts[ei]), 'northing': float(parts[oi]), 'depth': float(parts[headers.index('Aslaid Depth')]) if 'Aslaid Depth' in headers else 0.0 } except: continue return positions def scan(): pos = load_pos() nodes = {} all_dates = set() file_count = 0 print("🔍 Scanning ONLY 'data' H5 files (ignoring 'aux')...") all_h5_files = [] for root in DATA_ROOTS: all_h5_files.extend(list(root.rglob("*.h5"))) for h5_path in tqdm(all_h5_files): # FILTRE : Uniquement les fichiers contenant "data" if "_data_" not in h5_path.name.lower(): continue try: match = re.search(r'auto_(\d+)_(\d{6})_b(\d+)_.*?_(\d{10})\.h5$', h5_path.name) if not match: continue julian_day = int(match.group(1)) time_str = match.group(2) node_id = match.group(3) date_ref = datetime(2020, 1, 1) + timedelta(days=julian_day - 1) date_str = date_ref.strftime('%Y-%m-%d') h, m, s = int(time_str[:2]), int(time_str[2:4]), int(time_str[4:6]) actual_start_ts = int(datetime(2020, 1, 1).timestamp() + (julian_day - 1) * 86400 + h * 3600 + m * 60 + s) with h5py.File(h5_path, 'r') as f: if 'adc_values' not in f: continue duration = f['adc_values'].shape[0] / SAMPLE_RATE actual_end_ts = actual_start_ts + duration all_dates.add(date_str) if node_id not in nodes: nodes[node_id] = { 'id': node_id, 'position': pos.get(node_id), 'files': [] } # On extrait le canal du nom de fichier pour un matching plus précis channel_match = re.search(r'_ch(\d+)_', h5_path.name) channel = f"ch{channel_match.group(1)}" if channel_match else "ch0" nodes[node_id]['files'].append({ 'path': str(h5_path), 'start': actual_start_ts, 'end': actual_end_ts, 'julian': julian_day, 'channel': channel # Canal spécifique au fichier }) file_count += 1 except: continue result = { 'generated_at': datetime.now().isoformat(), 'sample_rate_hz': SAMPLE_RATE, 'nodes': nodes, 'dates': sorted(list(all_dates)) } with open(OUTPUT_INDEX, 'w') as f: json.dump(result, f, indent=2) print(f"✅ Index updated: {file_count} 'data' files, {len(nodes)} nodes.") if __name__ == '__main__': scan()