"""
Script d'inventaire des fichiers HDF5.
Extrait les timestamps des noms de fichiers et génère un rapport.
"""

import os
import re
from pathlib import Path
from datetime import datetime
from collections import defaultdict

# Root folders scanned by the inventory
DATA_ROOTS = [
    Path(raw_path)
    for raw_path in (
        r"F:\2020-09-12",
        r"F:\2020-09-13",
        r"F:\2020-09-14",
        r"F:\2020-09-15",
        r"F:\2020-09-16",
        r"F:\2020-09-17",
        r"F:\2020-09-18",
        r"F:\2020-09-19",
        r"F:\2020-09-21",
        r"F:\2020-09-22",
        r"F:\2020-09-23",
    )
]

# Pattern capturing the node id (digits after "_b") and the 10-digit
# timestamp that immediately precedes the ".h5" extension.
# Example: auto_256_070617_b67_14_025708_data_rsn6027_seq1_ch0_1599057453.h5
PATTERN = re.compile(
    r'_b(\d+)_.*?(\d{10})\.h5$',
    flags=re.IGNORECASE,
)


def _format_time_span(first_ts, last_ts):
    """Return a "start -> end" label for two Unix timestamps.

    The end keeps the short ``HH:MM`` form when both timestamps fall on the
    same local calendar day. Otherwise the full date is repeated, so that a
    span covering several days cannot be mistaken for a same-day span.
    """
    start = datetime.fromtimestamp(first_ts)
    end = datetime.fromtimestamp(last_ts)
    end_format = '%H:%M' if start.date() == end.date() else '%Y-%m-%d %H:%M'
    return f"{start.strftime('%Y-%m-%d %H:%M')} -> {end.strftime(end_format)}"


def main():
    """Scan every folder in DATA_ROOTS and print an HDF5 file inventory.

    Files whose name matches PATTERN are grouped by root folder and node id.
    The report is written to stdout and contains per-folder totals, global
    statistics, the overall time range, a per-node table limited to the
    nodes with the most files, and the number of distinct timestamps per
    calendar day. Timestamps are converted with the local time zone.

    Root folders that do not exist and files that cannot be stat'ed are
    reported on stderr and skipped, so one bad entry does not abort the scan.
    """
    import sys  # only needed for the warnings written to stderr

    # Number of rows shown in the per-node detail table
    top_nodes = 30
    # Compiled once instead of being looked up for every file
    channel_pattern = re.compile(r'_ch(\d+)_')

    print("=" * 70)
    print("INVENTAIRE DES FICHIERS HDF5")
    print("=" * 70)

    # Structure: folder -> node_id -> list of per-file records
    inventory = defaultdict(lambda: defaultdict(list))

    # Global statistics
    total_files = 0
    total_size = 0
    nodes_set = set()
    timestamps_set = set()

    for root in DATA_ROOTS:
        if not root.exists():
            print(f"ATTENTION: dossier introuvable, ignoré: {root}", file=sys.stderr)
            continue

        folder_name = root.name

        for h5_file in root.rglob("*.h5"):
            name = h5_file.name
            match = PATTERN.search(name)
            if not match:
                continue

            node_id = match.group(1)
            timestamp = int(match.group(2))

            # File type is derived from a marker in the name (data or aux)
            if "_data_" in name:
                file_type = "data"
            elif "_aux_" in name:
                file_type = "aux"
            else:
                file_type = "unknown"

            # Channel, when the name carries one
            ch_match = channel_pattern.search(name)
            channel = f"ch{ch_match.group(1)}" if ch_match else "?"

            try:
                file_size = h5_file.stat().st_size
            except OSError as exc:
                # e.g. file removed during the scan or unreadable entry
                print(f"ATTENTION: fichier illisible, ignoré: {h5_file} ({exc})",
                      file=sys.stderr)
                continue

            inventory[folder_name][node_id].append({
                'timestamp': timestamp,
                'datetime': datetime.fromtimestamp(timestamp),
                'type': file_type,
                'channel': channel,
                'filename': name,
                'size': file_size
            })

            total_files += 1
            total_size += file_size
            nodes_set.add(node_id)
            timestamps_set.add(timestamp)

    # Per-folder report
    print(f"\n{'DOSSIER':<15} {'NODES':<10} {'FICHIERS':<10} {'TAILLE':<15}")
    print("-" * 50)

    for folder in sorted(inventory):
        folder_data = inventory[folder]
        n_nodes = len(folder_data)
        n_files = sum(len(files) for files in folder_data.values())
        folder_size = sum(f['size'] for files in folder_data.values() for f in files)
        print(f"{folder:<15} {n_nodes:<10} {n_files:<10} {folder_size / 1e9:.2f} GB")

    # Global statistics
    print("\n" + "=" * 70)
    print("STATISTIQUES GLOBALES")
    print("=" * 70)
    print(f"Fichiers H5 totaux: {total_files}")
    print(f"Taille totale: {total_size / 1e9:.2f} GB")
    print(f"Nodes uniques: {len(nodes_set)}")

    # Overall time range
    if timestamps_set:
        min_ts = min(timestamps_set)
        max_ts = max(timestamps_set)
        print("\nPlage temporelle des données:")
        print(f"  Début: {datetime.fromtimestamp(min_ts)} (timestamp: {min_ts})")
        print(f"  Fin:   {datetime.fromtimestamp(max_ts)} (timestamp: {max_ts})")

    # Per-node detail, limited to the `top_nodes` nodes with the most files
    print("\n" + "=" * 70)
    print("DETAIL PAR NODE (nodes avec le plus de fichiers)")
    print("=" * 70)

    # Aggregate across folders: a node may appear in several of them
    node_stats = defaultdict(lambda: {'files': 0, 'size': 0, 'timestamps': set(), 'folders': set()})

    for folder, folder_data in inventory.items():
        for node_id, files in folder_data.items():
            entry = node_stats[node_id]
            entry['files'] += len(files)
            entry['size'] += sum(f['size'] for f in files)
            entry['timestamps'].update(f['timestamp'] for f in files)
            entry['folders'].add(folder)

    # Sort by file count, largest first
    sorted_nodes = sorted(node_stats.items(), key=lambda x: x[1]['files'], reverse=True)

    print(f"\n{'NODE':<8} {'FICHIERS':<10} {'TAILLE':<12} {'DATES':<25} {'DOSSIERS'}")
    print("-" * 90)

    for node_id, stats in sorted_nodes[:top_nodes]:
        # Every aggregated node has at least one file, so the set is non-empty
        date_range = _format_time_span(min(stats['timestamps']), max(stats['timestamps']))
        folders = ", ".join(sorted(stats['folders']))
        print(f"b{node_id:<7} {stats['files']:<10} {stats['size']/1e6:.1f} MB    {date_range:<25} {folders}")

    # Distinct calendar days covered by the timestamps
    print("\n" + "=" * 70)
    print("JOURS DE DONNEES DISPONIBLES (basé sur timestamps)")
    print("=" * 70)

    # Single pass: count the distinct timestamps falling on each day
    day_counts = defaultdict(int)
    for ts in timestamps_set:
        day_counts[datetime.fromtimestamp(ts).strftime('%Y-%m-%d')] += 1

    for day in sorted(day_counts):
        print(f"  {day}: ~{day_counts[day]} timestamps uniques")

# Run the inventory only when executed as a script, not when imported.
if __name__ == "__main__":
    main()