Fix coverage: add /api/coverage route, remove stray gather code from loadCoverage
This commit is contained in:
158
scripts/inventory_h5.py
Executable file
158
scripts/inventory_h5.py
Executable file
@@ -0,0 +1,158 @@
|
||||
"""
|
||||
Script d'inventaire des fichiers HDF5.
|
||||
Extrait les timestamps des noms de fichiers et génère un rapport.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
|
||||
# Root folders that the inventory scans recursively.
# Note: 2020-09-20 is not part of the list.
DATA_ROOTS = list(map(Path, (
    r"F:\2020-09-12",
    r"F:\2020-09-13",
    r"F:\2020-09-14",
    r"F:\2020-09-15",
    r"F:\2020-09-16",
    r"F:\2020-09-17",
    r"F:\2020-09-18",
    r"F:\2020-09-19",
    r"F:\2020-09-21",
    r"F:\2020-09-22",
    r"F:\2020-09-23",
)))
|
||||
|
||||
# Filename pattern: group 1 is the node id (the digits following "_b"),
# group 2 is the 10-digit timestamp that immediately precedes ".h5".
# Example: auto_256_070617_b67_14_025708_data_rsn6027_seq1_ch0_1599057453.h5
PATTERN = re.compile(
    r'_b(\d+)_.*?(\d{10})\.h5$',
    flags=re.I,
)
|
||||
|
||||
|
||||
# Compiled once instead of being rebuilt for every scanned file.
_CHANNEL_RE = re.compile(r'_ch(\d+)_')

# Number of rows shown in the per-node table.
_TOP_NODES = 30


def _collect_inventory(roots, pattern):
    """Walk *roots* and group the matching ``*.h5`` files.

    Returns a nested mapping ``folder name -> node id -> list of records``.
    Each record is a dict with the keys ``timestamp``, ``datetime``,
    ``type``, ``channel``, ``filename`` and ``size``.  Roots that do not
    exist and file names that do not match *pattern* are skipped.
    """
    inventory = defaultdict(lambda: defaultdict(list))

    for root in roots:
        root = Path(root)
        if not root.exists():
            continue

        for h5_file in root.rglob("*.h5"):
            name = h5_file.name
            match = pattern.search(name)
            if not match:
                continue

            # A file that vanished or cannot be read must not abort the
            # whole scan: report it and carry on with the next one.
            try:
                file_size = h5_file.stat().st_size
            except OSError as exc:
                print(f"Fichier ignoré (illisible): {h5_file} ({exc})")
                continue

            node_id = match.group(1)
            timestamp = int(match.group(2))

            # File kind is derived from a marker embedded in the name.
            if "_data_" in name:
                file_type = "data"
            elif "_aux_" in name:
                file_type = "aux"
            else:
                file_type = "unknown"

            ch_match = _CHANNEL_RE.search(name)
            channel = f"ch{ch_match.group(1)}" if ch_match else "?"

            inventory[root.name][node_id].append({
                'timestamp': timestamp,
                # Naive datetime expressed in the machine's local timezone.
                'datetime': datetime.fromtimestamp(timestamp),
                'type': file_type,
                'channel': channel,
                'filename': name,
                'size': file_size
            })

    return inventory


def _aggregate_by_node(inventory):
    """Merge the per-folder records into one statistics dict per node id."""
    node_stats = defaultdict(
        lambda: {'files': 0, 'size': 0, 'timestamps': set(), 'folders': set()}
    )
    for folder, folder_data in inventory.items():
        for node_id, files in folder_data.items():
            stats = node_stats[node_id]
            stats['files'] += len(files)
            stats['size'] += sum(f['size'] for f in files)
            stats['timestamps'].update(f['timestamp'] for f in files)
            stats['folders'].add(folder)
    return node_stats


def _format_span(first_ts, last_ts):
    """Render ``start -> end``; the end carries its date when the day differs."""
    start = datetime.fromtimestamp(first_ts)
    end = datetime.fromtimestamp(last_ts)
    # Showing only HH:MM for an end on another day would be misleading.
    end_format = '%H:%M' if end.date() == start.date() else '%Y-%m-%d %H:%M'
    return f"{start.strftime('%Y-%m-%d %H:%M')} -> {end.strftime(end_format)}"


def main(roots=None, pattern=None):
    """Scan HDF5 files and print an inventory report to standard output.

    The report contains a per-folder summary, global statistics, the time
    range covered by the timestamps found in the file names, a table of the
    nodes owning the most files, and the number of distinct timestamps per
    day.  Timestamps are rendered in the machine's local timezone.

    Args:
        roots: Iterable of directories to scan recursively.  Defaults to
            the module-level ``DATA_ROOTS``.
        pattern: Compiled regular expression applied to each file name;
            group 1 must be the node id and group 2 the integer timestamp.
            Defaults to the module-level ``PATTERN``.

    Returns:
        None.
    """
    roots = DATA_ROOTS if roots is None else roots
    pattern = PATTERN if pattern is None else pattern

    print("=" * 70)
    print("INVENTAIRE DES FICHIERS HDF5")
    print("=" * 70)

    inventory = _collect_inventory(roots, pattern)
    node_stats = _aggregate_by_node(inventory)

    # Global figures are derived from the per-node aggregation.
    total_files = sum(stats['files'] for stats in node_stats.values())
    total_size = sum(stats['size'] for stats in node_stats.values())
    timestamps_set = set()
    for stats in node_stats.values():
        timestamps_set |= stats['timestamps']

    # Per-folder report
    print(f"\n{'DOSSIER':<15} {'NODES':<10} {'FICHIERS':<10} {'TAILLE':<15}")
    print("-" * 50)

    for folder in sorted(inventory):
        folder_data = inventory[folder]
        n_nodes = len(folder_data)
        n_files = sum(len(files) for files in folder_data.values())
        folder_size = sum(f['size'] for files in folder_data.values() for f in files)
        print(f"{folder:<15} {n_nodes:<10} {n_files:<10} {folder_size / 1e9:.2f} GB")

    # Global statistics
    print("\n" + "=" * 70)
    print("STATISTIQUES GLOBALES")
    print("=" * 70)
    print(f"Fichiers H5 totaux: {total_files}")
    print(f"Taille totale: {total_size / 1e9:.2f} GB")
    print(f"Nodes uniques: {len(node_stats)}")

    # Time range
    if timestamps_set:
        min_ts = min(timestamps_set)
        max_ts = max(timestamps_set)
        print("\nPlage temporelle des données:")
        print(f" Début: {datetime.fromtimestamp(min_ts)} (timestamp: {min_ts})")
        print(f" Fin: {datetime.fromtimestamp(max_ts)} (timestamp: {max_ts})")

    # Per-node detail, limited to the _TOP_NODES nodes with the most files
    print("\n" + "=" * 70)
    print("DETAIL PAR NODE (nodes avec le plus de fichiers)")
    print("=" * 70)

    sorted_nodes = sorted(node_stats.items(), key=lambda x: x[1]['files'], reverse=True)

    print(f"\n{'NODE':<8} {'FICHIERS':<10} {'TAILLE':<12} {'DATES':<25} {'DOSSIERS'}")
    print("-" * 90)

    for node_id, stats in sorted_nodes[:_TOP_NODES]:
        if stats['timestamps']:
            date_range = _format_span(min(stats['timestamps']), max(stats['timestamps']))
        else:
            date_range = "N/A"

        folders = ", ".join(sorted(stats['folders']))
        # Pad the size cell so the row lines up with the 12-wide header.
        size_text = f"{stats['size'] / 1e6:.1f} MB"
        print(f"b{node_id:<7} {stats['files']:<10} {size_text:<12} {date_range:<25} {folders}")

    # Distinct days, with the number of unique timestamps falling on each
    print("\n" + "=" * 70)
    print("JOURS DE DONNEES DISPONIBLES (basé sur timestamps)")
    print("=" * 70)

    # Single pass: each timestamp is converted and counted exactly once.
    per_day = defaultdict(int)
    for ts in timestamps_set:
        per_day[datetime.fromtimestamp(ts).strftime('%Y-%m-%d')] += 1

    for day in sorted(per_day):
        print(f" {day}: ~{per_day[day]} timestamps uniques")
|
||||
|
||||
|
||||
# Run the inventory only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user