# seisee/scripts/generate_inventory.py

#!/usr/bin/env python3
"""
Script pour générer un inventaire HTML de tous les fichiers HDF5.
Affiche: numéro de bumper, canal, date/heure début, date/heure fin, durée, nombre d'échantillons.
"""

import os
import sys
import json
import h5py
import re
from datetime import datetime
from pathlib import Path
from collections import defaultdict

# Configuration
# Sampling rate used to convert a file's sample count into a duration.
SAMPLE_RATE = 200  # Hz
# Default directories to scan, in addition to any discovered through the
# index file in main(). scan_directory() looks for *.h5 files in the `data`
# sub-folder of each entry; entries that do not exist are skipped by main().
DATA_DIRS = [
    r"F:\2020-09-11",
    r"E:\2020-09-11",
    r"E:\2020-09-14",
]

def parse_filename(filename):
    """
    Extract bumper id, channel, start epoch and file type from an HDF5 file name.

    Supported bumper encodings:
      - auto_260_061316_b0_13_212626_data_rsn84614_seq1_ch0_1598976585.h5 (bumper = 13)
      - auto_255_061140_b119_12_230609_data_rsn5725_seq1_ch0_1599065292.h5 (bumper = 119)

    Returns a dict with the keys 'bumper_id' (digit string or None),
    'channel' (e.g. 'ch0', or None), 'epoch_time' (int or None) and
    'file_type' ('data', 'aux' or 'unknown').
    """
    # The "_b0_<id>_" form takes precedence; "_b<id>_" is only the fallback.
    bumper_id = None
    for pattern in (r'_b0_(\d+)_', r'_b(\d+)_'):
        found = re.search(pattern, filename)
        if found:
            bumper_id = found.group(1)
            break

    # Channel token such as "_ch0_" or "_ch15_".
    channel = None
    found = re.search(r'_(ch\d+)_', filename)
    if found:
        channel = found.group(1)

    # Ten-digit epoch located right before the ".h5" extension.
    epoch_time = None
    found = re.search(r'_(\d{10})\.h5$', filename)
    if found:
        epoch_time = int(found.group(1))

    # File kind is identified by a "_data_" or "_aux_" marker in the name.
    if '_data_' in filename:
        file_type = 'data'
    elif '_aux_' in filename:
        file_type = 'aux'
    else:
        file_type = 'unknown'

    return dict(
        bumper_id=bumper_id,
        channel=channel,
        epoch_time=epoch_time,
        file_type=file_type,
    )

def get_hdf5_info(filepath):
    """
    Open an HDF5 file and report how many samples its 'adc_values' dataset holds.

    Returns a dict with 'samples' (first dimension of the dataset, or 0 on
    any problem) and 'error' (None on success, otherwise a message). Any
    exception raised while opening or reading the file is reported through
    'error' instead of being propagated.
    """
    try:
        with h5py.File(filepath, 'r') as h5_file:
            if 'adc_values' not in h5_file:
                # Report which top-level entries exist to help diagnose the file.
                available = list(h5_file.keys())
                return {'samples': 0, 'error': f'No adc_values, found: {available}'}

            sample_count = h5_file['adc_values'].shape[0]
            return {'samples': sample_count, 'error': None}
    except Exception as exc:
        return {'samples': 0, 'error': str(exc)}

def format_datetime(epoch_time):
    """Return the timestamp as 'YYYY-MM-DD HH:MM:SS' local time, or "N/A" if falsy."""
    if epoch_time:
        return datetime.fromtimestamp(epoch_time).strftime('%Y-%m-%d %H:%M:%S')
    return "N/A"

def format_duration(seconds):
    """Render a duration in seconds as 'Xh Ym Zs', 'Ym Zs' or 'Zs'.

    Leading units are dropped when they are zero: hours are shown only when
    positive, and minutes only when hours or minutes are positive.
    """
    whole_hours = int(seconds // 3600)
    whole_minutes = int((seconds % 3600) // 60)
    whole_seconds = int(seconds % 60)

    # Build the label from the smallest unit outwards.
    pieces = [f"{whole_seconds}s"]
    if whole_hours > 0 or whole_minutes > 0:
        pieces.insert(0, f"{whole_minutes}m")
    if whole_hours > 0:
        pieces.insert(0, f"{whole_hours}h")
    return " ".join(pieces)

def scan_directory(data_dir):
    """List the ``*.h5`` files found directly inside ``<data_dir>/data``.

    Returns a list of Path objects. When the ``data`` sub-folder does not
    exist, a message is printed and an empty list is returned.
    """
    data_path = Path(data_dir) / 'data'

    if data_path.exists():
        return list(data_path.glob('*.h5'))

    print(f"  Directory not found: {data_path}")
    return []

def generate_html(inventory, output_path):
    """Render the inventory as a standalone HTML report and write it to disk.

    Args:
        inventory: list of dicts, one per HDF5 file. The keys read here are
            'bumper_id', 'channel', 'epoch_time', 'file_type', 'samples',
            'error' and 'filename'.
        output_path: destination of the HTML file (written as UTF-8).

    The report contains global statistics, a per-channel summary for
    ch0..ch3, and one table row per file, grouped by bumper and channel and
    ordered by start epoch. File names and error messages are HTML-escaped
    before being inserted into the page.
    """
    # Local import: only this function needs it.
    from html import escape

    # Group by bumper, then by channel.
    by_bumper = defaultdict(lambda: defaultdict(list))
    for item in inventory:
        bumper = item['bumper_id'] or 'unknown'
        channel = item['channel'] or 'unknown'
        by_bumper[bumper][channel].append(item)

    def bumper_sort_key(bumper):
        # Numeric ids sort numerically; anything else (e.g. 'unknown') always
        # comes last, whatever the magnitude of the numeric ids.
        return (0, int(bumper)) if bumper.isdigit() else (1, 0)

    sorted_bumpers = sorted(by_bumper, key=bumper_sort_key)

    # Global statistics.
    total_files = len(inventory)
    total_samples = sum(i['samples'] for i in inventory)
    total_duration = total_samples / SAMPLE_RATE
    total_errors = sum(1 for i in inventory if i['error'])

    # Per-channel statistics.
    channel_stats = defaultdict(lambda: {'files': 0, 'samples': 0, 'bumpers': set()})
    for item in inventory:
        stats = channel_stats[item['channel'] or 'unknown']
        stats['files'] += 1
        stats['samples'] += item['samples']
        if item['bumper_id']:
            stats['bumpers'].add(item['bumper_id'])

    # Fragments are collected in a list and joined once at the end, instead
    # of growing one string with repeated "+=".
    parts = [f"""<!DOCTYPE html>
<html lang="fr">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Inventaire Fichiers HDF5 Sismiques</title>
    <style>
        * {{ box-sizing: border-box; }}
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            background: #0a0a1a;
            color: #eee;
            margin: 0;
            padding: 20px;
        }}
        h1 {{
            color: #4ade80;
            border-bottom: 2px solid #4ade80;
            padding-bottom: 10px;
        }}
        h2 {{
            color: #e94560;
            margin-top: 30px;
        }}
        h3 {{
            color: #fbbf24;
            margin-top: 20px;
        }}
        .stats {{
            background: #16213e;
            padding: 20px;
            border-radius: 8px;
            margin-bottom: 30px;
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 20px;
        }}
        .stat-box {{
            background: #0f3460;
            padding: 15px;
            border-radius: 6px;
            text-align: center;
        }}
        .stat-value {{
            font-size: 2rem;
            font-weight: bold;
            color: #4ade80;
        }}
        .stat-label {{
            color: #888;
            font-size: 0.9rem;
        }}
        .channel-summary {{
            display: grid;
            grid-template-columns: repeat(4, 1fr);
            gap: 10px;
            margin-bottom: 30px;
        }}
        .channel-box {{
            background: #16213e;
            padding: 15px;
            border-radius: 6px;
            text-align: center;
        }}
        .channel-box h4 {{
            margin: 0 0 10px 0;
            color: #4ade80;
        }}
        table {{
            width: 100%;
            border-collapse: collapse;
            margin-bottom: 20px;
            font-size: 0.9rem;
        }}
        th, td {{
            padding: 10px;
            text-align: left;
            border-bottom: 1px solid #1a1a2e;
        }}
        th {{
            background: #16213e;
            color: #4ade80;
            position: sticky;
            top: 0;
        }}
        tr:hover {{
            background: #16213e;
        }}
        .ch0 {{ color: #4ade80; }}
        .ch1 {{ color: #60a5fa; }}
        .ch2 {{ color: #fbbf24; }}
        .ch3 {{ color: #f472b6; }}
        .data {{ color: #4ade80; }}
        .aux {{ color: #888; }}
        .error {{ color: #e94560; font-size: 0.8rem; }}
        .bumper-section {{
            background: #0f3460;
            padding: 15px;
            border-radius: 8px;
            margin-bottom: 20px;
        }}
        .filter-controls {{
            background: #16213e;
            padding: 15px;
            border-radius: 8px;
            margin-bottom: 20px;
            display: flex;
            gap: 20px;
            flex-wrap: wrap;
        }}
        .filter-controls label {{
            display: flex;
            align-items: center;
            gap: 8px;
            cursor: pointer;
        }}
        input[type="checkbox"] {{
            width: 18px;
            height: 18px;
        }}
        .summary-table {{
            width: auto;
            margin: 0 auto;
        }}
        .summary-table td {{
            padding: 5px 15px;
        }}
    </style>
</head>
<body>
    <h1>📊 Inventaire Fichiers HDF5 Sismiques</h1>
    <p>Généré le {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>

    <div class="stats">
        <div class="stat-box">
            <div class="stat-value">{total_files}</div>
            <div class="stat-label">Fichiers HDF5</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">{len(sorted_bumpers)}</div>
            <div class="stat-label">Bumpers (nodes)</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">{total_samples:,}</div>
            <div class="stat-label">Échantillons total</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">{format_duration(total_duration)}</div>
            <div class="stat-label">Durée totale @ {SAMPLE_RATE}Hz</div>
        </div>
        <div class="stat-box">
            <div class="stat-value">{total_errors}</div>
            <div class="stat-label">Erreurs lecture</div>
        </div>
    </div>

    <h2>📡 Résumé par Canal</h2>
    <div class="channel-summary">
"""]

    # Summary boxes are always shown for ch0..ch3, even when a channel has no file.
    for ch in ['ch0', 'ch1', 'ch2', 'ch3']:
        stats = channel_stats.get(ch, {'files': 0, 'samples': 0, 'bumpers': set()})
        duration = stats['samples'] / SAMPLE_RATE
        parts.append(f"""
        <div class="channel-box">
            <h4 class="{ch}">{ch.upper()}</h4>
            <div><strong>{stats['files']}</strong> fichiers</div>
            <div><strong>{len(stats['bumpers'])}</strong> bumpers</div>
            <div><strong>{stats['samples']:,}</strong> samples</div>
            <div>{format_duration(duration)}</div>
        </div>
""")

    parts.append("""
    </div>

    <h2>📋 Détail par Bumper</h2>

    <div class="filter-controls">
        <label><input type="checkbox" id="showCh0" checked onchange="filterTable()"> <span class="ch0">CH0</span></label>
        <label><input type="checkbox" id="showCh1" checked onchange="filterTable()"> <span class="ch1">CH1</span></label>
        <label><input type="checkbox" id="showCh2" checked onchange="filterTable()"> <span class="ch2">CH2</span></label>
        <label><input type="checkbox" id="showCh3" checked onchange="filterTable()"> <span class="ch3">CH3</span></label>
        <label><input type="checkbox" id="showData" checked onchange="filterTable()"> <span class="data">DATA</span></label>
        <label><input type="checkbox" id="showAux" checked onchange="filterTable()"> <span class="aux">AUX</span></label>
    </div>

    <table id="mainTable">
        <thead>
            <tr>
                <th>Bumper</th>
                <th>Canal</th>
                <th>Type</th>
                <th>Début (epoch)</th>
                <th>Début (date/heure)</th>
                <th>Fin (date/heure)</th>
                <th>Durée</th>
                <th>Samples</th>
                <th>Fichier</th>
            </tr>
        </thead>
        <tbody>
""")

    for bumper in sorted_bumpers:
        channels = by_bumper[bumper]
        for channel in sorted(channels):
            items = sorted(channels[channel], key=lambda x: x['epoch_time'] or 0)
            for item in items:
                epoch_time = item['epoch_time']
                file_type = item['file_type']
                duration_sec = item['samples'] / SAMPLE_RATE
                # End time is derived from the start epoch plus the sampled duration.
                end_time = (epoch_time + duration_sec) if epoch_time else None

                # File names and error messages are free text (error messages
                # may embed paths or dataset names), so escape them.
                filename_html = escape(str(item['filename']))
                error_html = (
                    f'<div class="error">{escape(str(item["error"]))}</div>'
                    if item['error'] else ''
                )

                parts.append(f"""
            <tr class="row-{channel} row-{file_type}">
                <td><strong>b{bumper}</strong></td>
                <td class="{channel}">{channel.upper()}</td>
                <td class="{file_type}">{file_type.upper()}</td>
                <td>{epoch_time or 'N/A'}</td>
                <td>{format_datetime(epoch_time)}</td>
                <td>{format_datetime(end_time)}</td>
                <td>{format_duration(duration_sec)}</td>
                <td>{item['samples']:,}</td>
                <td style="font-size: 0.8rem; color: #888;">{filename_html}{error_html}</td>
            </tr>
""")

    # In the filter script, rows whose channel or type has no checkbox
    # (e.g. ch5, ch15, "unknown") are never hidden by that criterion;
    # otherwise they would vanish for good on the first checkbox toggle.
    parts.append("""
        </tbody>
    </table>

    <script>
        function filterTable() {
            const showCh0 = document.getElementById('showCh0').checked;
            const showCh1 = document.getElementById('showCh1').checked;
            const showCh2 = document.getElementById('showCh2').checked;
            const showCh3 = document.getElementById('showCh3').checked;
            const showData = document.getElementById('showData').checked;
            const showAux = document.getElementById('showAux').checked;

            const rows = document.querySelectorAll('#mainTable tbody tr');
            rows.forEach(row => {
                const isCh0 = row.classList.contains('row-ch0');
                const isCh1 = row.classList.contains('row-ch1');
                const isCh2 = row.classList.contains('row-ch2');
                const isCh3 = row.classList.contains('row-ch3');
                const isData = row.classList.contains('row-data');
                const isAux = row.classList.contains('row-aux');

                const hasChannelFilter = isCh0 || isCh1 || isCh2 || isCh3;
                const hasTypeFilter = isData || isAux;

                const channelVisible = !hasChannelFilter ||
                                       (isCh0 && showCh0) || (isCh1 && showCh1) ||
                                       (isCh2 && showCh2) || (isCh3 && showCh3);
                const typeVisible = !hasTypeFilter ||
                                    (isData && showData) || (isAux && showAux);

                row.style.display = (channelVisible && typeVisible) ? '' : 'none';
            });
        }
    </script>
</body>
</html>
""")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

    print(f"\nHTML genere: {output_path}")

def _collect_index_dirs(index_path):
    """Return the existing acquisition directories referenced by the index file.

    The index is read as JSON with the layout
    ``{"nodes": {<node>: {"dates": {<date>: [{"path": ...}, ...]}}}}``.
    For each file entry, the directory two levels above the file is
    collected. Returns a set of directory strings that exist on disk, or an
    empty set when the index file itself is missing.
    """
    if not index_path.exists():
        return set()

    with open(index_path, 'r', encoding='utf-8') as f:
        index = json.load(f)

    candidates = set()
    for node_data in index.get('nodes', {}).values():
        for files_list in node_data.get('dates', {}).values():
            # Each date is expected to map directly to a list of file entries.
            if not isinstance(files_list, list):
                continue
            for file_info in files_list:
                path_str = file_info.get('path')
                # A missing/empty path would resolve to Path('.') and drag the
                # current working directory into the scan, so skip it.
                if not path_str:
                    continue
                candidates.add(str(Path(path_str).parent.parent))

    # Hit the filesystem once per unique directory rather than once per file.
    return {d for d in candidates if Path(d).exists()}


def main():
    """Scan every known data directory and write the HTML and JSON inventories.

    Directories come from the existing index file (when present) plus the
    entries of DATA_DIRS that exist. Each HDF5 file found is described by its
    parsed file name and its sample count, then the full list is written to
    ``inventory.html`` and ``inventory.json``.
    """
    print("=" * 60)
    print("INVENTAIRE DES FICHIERS HDF5 SISMIQUES")
    print("=" * 60)

    # Load the existing index to discover all directories it references.
    index_path = Path(r"F:\seismic_webapp\data\index.json")
    all_dirs = _collect_index_dirs(index_path)

    # Add the default directories.
    all_dirs.update(d for d in DATA_DIRS if Path(d).exists())

    print(f"\nRépertoires à scanner: {len(all_dirs)}")
    for d in sorted(all_dirs):
        print(f"  - {d}")

    # Scan all files.
    inventory = []

    for data_dir in sorted(all_dirs):
        print(f"\nScanning {data_dir}...")
        files = scan_directory(data_dir)
        print(f"  Found {len(files)} HDF5 files")

        for i, filepath in enumerate(files):
            # Progress message every 50 files.
            if i % 50 == 0:
                print(f"  Processing {i}/{len(files)}...")

            parsed = parse_filename(filepath.name)
            hdf5_info = get_hdf5_info(filepath)

            inventory.append({
                'filepath': str(filepath),
                'filename': filepath.name,
                'directory': data_dir,
                'bumper_id': parsed['bumper_id'],
                'channel': parsed['channel'],
                'epoch_time': parsed['epoch_time'],
                'file_type': parsed['file_type'],
                'samples': hdf5_info['samples'],
                'error': hdf5_info['error']
            })

    print(f"\nTotal: {len(inventory)} fichiers")

    # Generate the HTML report.
    output_path = Path(r"F:\seismic_webapp\inventory.html")
    generate_html(inventory, output_path)

    # Also save the raw inventory as JSON for reference.
    json_path = Path(r"F:\seismic_webapp\inventory.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(inventory, f, indent=2, ensure_ascii=False)
    print(f"JSON genere: {json_path}")

# Run the inventory only when executed as a script, not when imported.
if __name__ == '__main__':
    main()