import os, re, json, h5py
from pathlib import Path
from datetime import datetime
from tqdm import tqdm

# Directories that scan() searches recursively for "*.h5" files.
DATA_ROOTS = [Path("/mnt/kingston"), Path("/mnt/data_sdb1")]
# Destination of the JSON index written by scan().
OUTPUT_INDEX = Path("/mnt/kingston/seismic_webapp/data/index.json")
# Samples per second, stored in the index under 'sample_rate_hz'. scan() divides
# each dataset's length by this value to get its duration, so the same rate is
# applied to every file.
SAMPLE_RATE = 200

def scan():
    """Index the time range of every HDF5 recording found under DATA_ROOTS.

    Each root in DATA_ROOTS is searched recursively for ``*.h5`` files. For
    every file whose name contains ``_b<digits>_`` (the digits are used as the
    node id) and that holds an ``adc_values`` dataset with a non-zero
    ``timestamp`` attribute, one entry is recorded with:

    * ``path``     -- the file path as a string,
    * ``start``    -- the ``timestamp`` attribute truncated to ``int``,
    * ``end``      -- ``start`` plus the dataset length divided by SAMPLE_RATE,
    * ``channels`` -- a fixed list ``['ch0', 'ch1', 'ch2', 'ch3']``.

    The entries, grouped by node id, are written as JSON to OUTPUT_INDEX
    together with the generation time and the sample rate.

    Files that cannot be opened or read are skipped, and the skipped paths are
    reported once the scan is finished. Only ``Exception`` subclasses are
    tolerated, so Ctrl-C (``KeyboardInterrupt``) and ``SystemExit`` still stop
    the scan.

    Returns:
        None. The result is the file written to OUTPUT_INDEX.

    Raises:
        OSError: if OUTPUT_INDEX cannot be written.
    """
    # Compiled once here instead of being looked up again for every file.
    node_id_re = re.compile(r'_b(\d+)_')

    index = {}
    file_count = 0
    skipped = []  # (path, exception) for every file that failed to read
    print("Scanning H5 files for time ranges...")

    # Gather all candidates first so that tqdm knows the total up front.
    all_files = []
    for root in DATA_ROOTS:
        all_files.extend(root.rglob("*.h5"))

    for h5_path in tqdm(all_files):
        # The node id is encoded in the file name.
        match = node_id_re.search(h5_path.name)
        if not match:
            continue
        nid = match.group(1)

        # Only the HDF5 access is guarded: a damaged or unreadable file must
        # not abort the whole scan, but it must not vanish silently either.
        try:
            with h5py.File(h5_path, 'r') as f:
                if 'adc_values' not in f:
                    continue
                ds = f['adc_values']
                # The timestamp stored in the file is used, not the file name.
                # NOTE(review): presumably epoch seconds -- confirm with the
                # code that writes these files.
                start_ts = int(ds.attrs.get('timestamp', 0))
                if start_ts == 0:
                    # A missing or zero timestamp means no usable start time.
                    continue
                duration = ds.shape[0] / SAMPLE_RATE
        except Exception as exc:
            skipped.append((h5_path, exc))
            continue

        index.setdefault(nid, []).append({
            'path': str(h5_path),
            'start': start_ts,
            'end': start_ts + duration,
            'channels': ['ch0', 'ch1', 'ch2', 'ch3'],
        })
        file_count += 1

    # Save the index.
    with open(OUTPUT_INDEX, 'w') as f:
        json.dump({
            'generated_at': datetime.now().isoformat(),
            'sample_rate_hz': SAMPLE_RATE,
            'files_by_node': index
        }, f)

    print(f"Index généré: {file_count} fichiers avec plages temporelles réelles.")

    # Reported after the loop so the messages do not garble the progress bar.
    if skipped:
        print(f"Warning: {len(skipped)} file(s) skipped because they could not be read:")
        for bad_path, exc in skipped:
            print(f"  {bad_path}: {exc!r}")

# Run the scan only when this file is executed as a script, not on import.
if __name__ == '__main__':
    scan()