import csv
import json
import os
import re
from datetime import datetime
from pathlib import Path

import h5py
from tqdm import tqdm

# Directories searched recursively for "*.h5" recordings by scan().
DATA_ROOTS = [Path("/mnt/kingston"), Path("/mnt/data_sdb1")]
# CSV file read by load_pos() to obtain per-node easting/northing/depth.
POSITIONS_CSV = Path("/mnt/kingston/Copie de SETE_AUV_DARFV4-Copier(1).csv")
# Destination of the JSON index written by scan().
OUTPUT_INDEX = Path("/mnt/kingston/seismic_webapp/data/index.json")
# Samples per second; scan() divides the dataset length by this value to get a
# duration and stores it in the index under 'sample_rate_hz'.
SAMPLE_RATE = 200

def _find_column(headers, *candidates):
    """Return the index of the first candidate name found in headers, else None."""
    for name in candidates:
        if name in headers:
            return headers.index(name)
    return None


def _split_csv_line(line):
    """Split one physical CSV line into fields, honouring quoted commas."""
    return next(csv.reader([line]), [])


def load_pos(csv_path=None):
    """Load node positions from the positions CSV.

    The first three physical lines of the file are ignored, the fourth line
    is the header row and every following line is a data row.

    Required columns: 'NodeCode', an easting column ('Aslaid Easting',
    falling back to 'Preplot Easting') and a northing column
    ('Aslaid Northing', falling back to 'Preplot Northing'). The fallback is
    resolved independently for each coordinate. 'Aslaid Depth' is optional;
    when the column is absent the depth is reported as 0.0.

    Args:
        csv_path: Optional path of the CSV to read. Defaults to
            POSITIONS_CSV when omitted.

    Returns:
        dict mapping the stripped node code (str) to
        {'easting': float, 'northing': float, 'depth': float}.
        An empty dict is returned when the file is missing, too short, or
        lacks a required column. Rows that are too short or contain
        non-numeric values are skipped.
    """
    source = POSITIONS_CSV if csv_path is None else Path(csv_path)
    if not source.exists():
        return {}

    positions = {}
    with open(source, 'r', encoding='utf-8', errors='replace') as f:
        # Skip the three preamble lines that precede the header row.
        for _ in range(3):
            if not f.readline():
                return {}

        try:
            headers = [cell.strip() for cell in _split_csv_line(f.readline())]
        except csv.Error:
            return {}

        ni = _find_column(headers, 'NodeCode')
        ei = _find_column(headers, 'Aslaid Easting', 'Preplot Easting')
        oi = _find_column(headers, 'Aslaid Northing', 'Preplot Northing')
        di = _find_column(headers, 'Aslaid Depth')
        if ni is None or ei is None or oi is None:
            return {}

        # Each physical line is parsed on its own so that one malformed line
        # (e.g. a stray quote) cannot affect the lines that follow it.
        for line in f:
            try:
                parts = _split_csv_line(line)
                nid = parts[ni].strip()
                positions[nid] = {
                    'easting': float(parts[ei]),
                    'northing': float(parts[oi]),
                    'depth': float(parts[di]) if di is not None else 0.0,
                }
            except (IndexError, ValueError, csv.Error):
                # Blank, truncated or non-numeric row: best-effort skip.
                continue
    return positions

# Node id is the run of digits between "_b" and "_" in the file name.
_NODE_ID_RE = re.compile(r'_b(\d+)_')


def _read_h5_entry(h5_path):
    """Build the index entry for one recording.

    Returns the entry dict, or None when the file has no 'adc_values'
    dataset or its 'timestamp' attribute is missing or zero. Errors raised
    while opening or reading the file propagate to the caller.
    """
    with h5py.File(h5_path, 'r') as f:
        if 'adc_values' not in f:
            return None
        ds = f['adc_values']
        start_ts = int(ds.attrs.get('timestamp', 0))
        if start_ts == 0:
            return None
        # Number of samples along the first axis divided by the sample rate.
        duration = ds.shape[0] / SAMPLE_RATE

    return {
        'path': str(h5_path),
        'start': start_ts,
        'end': start_ts + duration,
        'channels': ['ch0', 'ch1', 'ch2', 'ch3'],
    }


def scan():
    """Index every "*.h5" recording under DATA_ROOTS and write OUTPUT_INDEX.

    For each file whose name matches ``_b<digits>_`` the start timestamp is
    read from the 'timestamp' attribute of the 'adc_values' dataset and the
    end is derived from the dataset length and SAMPLE_RATE. Entries are
    grouped per node id together with the node position from load_pos()
    (None when the node has no known position).

    Files that cannot be read are reported and skipped; they never abort
    the scan. The resulting index (generation time, sample rate, nodes and
    the sorted list of covered dates) is written as JSON to OUTPUT_INDEX,
    creating the parent directory if needed.

    Returns:
        None.
    """
    pos = load_pos()
    nodes = {}
    all_dates = set()
    file_count = 0
    skipped = 0

    print("🔍 Scanning all H5 files...")
    all_h5_files = []
    for root in DATA_ROOTS:
        all_h5_files.extend(root.rglob("*.h5"))
    # Sort so the scan order does not depend on filesystem traversal order.
    all_h5_files.sort()

    for h5_path in tqdm(all_h5_files):
        match = _NODE_ID_RE.search(h5_path.name)
        if not match:
            continue
        nid = match.group(1)

        try:
            entry = _read_h5_entry(h5_path)
            if entry is None:
                continue
            # NOTE(review): fromtimestamp() interprets the timestamp in the
            # local timezone of the machine running the scan - confirm this
            # is what the consumer of the index expects.
            date_str = datetime.fromtimestamp(entry['start']).strftime('%Y-%m-%d')
        except Exception as exc:
            # Best-effort indexing: report the unreadable file and move on.
            # KeyboardInterrupt is deliberately not caught so the scan can
            # be stopped with Ctrl-C.
            skipped += 1
            tqdm.write(f"⚠️ Skipping {h5_path}: {exc}")
            continue

        # Add the date to the global list.
        all_dates.add(date_str)

        if nid not in nodes:
            nodes[nid] = {
                'id': nid,
                'position': pos.get(nid),
                'files': [],
            }
        nodes[nid]['files'].append(entry)
        file_count += 1

    # Keep each node's recordings in chronological order.
    for node in nodes.values():
        node['files'].sort(key=lambda e: (e['start'], e['path']))

    dates = sorted(all_dates)

    # Save the complete index.
    result = {
        'generated_at': datetime.now().isoformat(),
        'sample_rate_hz': SAMPLE_RATE,
        'nodes': nodes,
        'dates': dates,
    }

    # Make sure the destination directory exists so a long scan is not lost
    # to a missing folder at the very end.
    OUTPUT_INDEX.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_INDEX, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)

    print(f"✅ Indexing complete: {file_count} files, {len(nodes)} nodes, {len(all_dates)} dates.")
    print(f"📅 Dates covered: {dates}")
    if skipped:
        print(f"⚠️ {skipped} files skipped because of read errors.")

# Build the index only when this file is executed as a script.
if __name__ == '__main__':
    scan()