""" Script d'inventaire des fichiers HDF5. Extrait les timestamps des noms de fichiers et génère un rapport. """ import os import re from pathlib import Path from datetime import datetime from collections import defaultdict # Dossiers racine DATA_ROOTS = [ Path(r"F:\2020-09-12"), Path(r"F:\2020-09-13"), Path(r"F:\2020-09-14"), Path(r"F:\2020-09-15"), Path(r"F:\2020-09-16"), Path(r"F:\2020-09-17"), Path(r"F:\2020-09-18"), Path(r"F:\2020-09-19"), Path(r"F:\2020-09-21"), Path(r"F:\2020-09-22"), Path(r"F:\2020-09-23"), ] # Pattern pour extraire node_id et timestamp # Exemple: auto_256_070617_b67_14_025708_data_rsn6027_seq1_ch0_1599057453.h5 PATTERN = re.compile(r'_b(\d+)_.*?(\d{10})\.h5$', re.IGNORECASE) def main(): print("=" * 70) print("INVENTAIRE DES FICHIERS HDF5") print("=" * 70) # Structure: folder -> node_id -> list of (timestamp, filename, type) inventory = defaultdict(lambda: defaultdict(list)) # Stats globales total_files = 0 total_size = 0 nodes_set = set() timestamps_set = set() for root in DATA_ROOTS: if not root.exists(): continue folder_name = root.name for h5_file in root.rglob("*.h5"): match = PATTERN.search(h5_file.name) if not match: continue node_id = match.group(1) timestamp = int(match.group(2)) # Déterminer le type (data ou aux) file_type = "data" if "_data_" in h5_file.name else "aux" if "_aux_" in h5_file.name else "unknown" # Extraire le channel si présent ch_match = re.search(r'_ch(\d+)_', h5_file.name) channel = f"ch{ch_match.group(1)}" if ch_match else "?" file_size = h5_file.stat().st_size inventory[folder_name][node_id].append({ 'timestamp': timestamp, 'datetime': datetime.fromtimestamp(timestamp), 'type': file_type, 'channel': channel, 'filename': h5_file.name, 'size': file_size }) total_files += 1 total_size += file_size nodes_set.add(node_id) timestamps_set.add(timestamp) # Rapport par dossier print(f"\n{'DOSSIER':<15} {'NODES':<10} {'FICHIERS':<10} {'TAILLE':<15}") print("-" * 50) for folder in sorted(inventory.keys()): folder_data = inventory[folder] n_nodes = len(folder_data) n_files = sum(len(files) for files in folder_data.values()) folder_size = sum(f['size'] for files in folder_data.values() for f in files) print(f"{folder:<15} {n_nodes:<10} {n_files:<10} {folder_size / 1e9:.2f} GB") # Stats globales print("\n" + "=" * 70) print("STATISTIQUES GLOBALES") print("=" * 70) print(f"Fichiers H5 totaux: {total_files}") print(f"Taille totale: {total_size / 1e9:.2f} GB") print(f"Nodes uniques: {len(nodes_set)}") # Plage temporelle if timestamps_set: min_ts = min(timestamps_set) max_ts = max(timestamps_set) print(f"\nPlage temporelle des données:") print(f" Début: {datetime.fromtimestamp(min_ts)} (timestamp: {min_ts})") print(f" Fin: {datetime.fromtimestamp(max_ts)} (timestamp: {max_ts})") # Détail par node (top 20) print("\n" + "=" * 70) print("DETAIL PAR NODE (nodes avec le plus de fichiers)") print("=" * 70) # Agréger par node node_stats = defaultdict(lambda: {'files': 0, 'size': 0, 'timestamps': set(), 'folders': set()}) for folder, folder_data in inventory.items(): for node_id, files in folder_data.items(): node_stats[node_id]['files'] += len(files) node_stats[node_id]['size'] += sum(f['size'] for f in files) node_stats[node_id]['timestamps'].update(f['timestamp'] for f in files) node_stats[node_id]['folders'].add(folder) # Trier par nombre de fichiers sorted_nodes = sorted(node_stats.items(), key=lambda x: x[1]['files'], reverse=True) print(f"\n{'NODE':<8} {'FICHIERS':<10} {'TAILLE':<12} {'DATES':<25} {'DOSSIERS'}") print("-" * 90) for node_id, stats in sorted_nodes[:30]: ts_list = sorted(stats['timestamps']) if ts_list: date_range = f"{datetime.fromtimestamp(ts_list[0]).strftime('%Y-%m-%d %H:%M')} -> {datetime.fromtimestamp(ts_list[-1]).strftime('%H:%M')}" else: date_range = "N/A" folders = ", ".join(sorted(stats['folders'])) print(f"b{node_id:<7} {stats['files']:<10} {stats['size']/1e6:.1f} MB {date_range:<25} {folders}") # Dates uniques (jours) print("\n" + "=" * 70) print("JOURS DE DONNEES DISPONIBLES (basé sur timestamps)") print("=" * 70) days = set() for ts in timestamps_set: days.add(datetime.fromtimestamp(ts).strftime('%Y-%m-%d')) for day in sorted(days): # Compter les fichiers pour ce jour day_files = sum(1 for ts in timestamps_set if datetime.fromtimestamp(ts).strftime('%Y-%m-%d') == day) print(f" {day}: ~{day_files} timestamps uniques") if __name__ == '__main__': main()