# paper_2025/python/condense_inv_data_log-hp.py

import numpy as np
import matplotlib.pyplot as plt
from thunderhopper.filetools import search_files
from thunderhopper.modeltools import load_data, save_data
from misc_functions import shorten_species, sort_files_by_rec
from IPython import embed

# GENERAL SETTINGS:
# Species to process; each one also gets its own column in the overview plot.
target_species = [
    'Chorthippus_biguttulus', 'Chorthippus_mollis', 'Chrysochraon_dispar',
    'Euchorthippus_declivus', 'Gomphocerippus_rufus', 'Omocestus_rufipes',
    'Pseudochorthippus_parallelus',
]
# Source labels handed to sort_files_by_rec() when grouping files by recording.
sources = ['BM04', 'BM93', 'DJN', 'GBC', 'FTN']
# Directory searched for input .npz files, and its subfolder for the output.
search_path = '../data/inv/log_hp/'
save_path = search_path + 'condensed/'

# ANALYSIS SETTINGS:
# Divide each loaded measure by its first entry before condensing.
compute_ratios = True
# Draw the three-row overview figure (songs / recordings / total).
plot_overview = True

# PREPARATION:
if plot_overview:
    # Grid with one column per species and three rows (single song files,
    # per-recording statistics, statistics across recordings). All panels
    # share both axes.
    fig, axes = plt.subplots(nrows=3, ncols=len(target_species),
                             figsize=(16, 9), sharex=True, sharey=True,
                             layout='constrained')

    # Only the left-most panel of each row carries the row label:
    row_labels = ('songs', 'recordings\n(mean ± SD)', 'total\n(mean ± SD)')
    for row, label in enumerate(row_labels):
        axes[row, 0].set_ylabel(label)

# EXECUTION:
# For each species: load every song file, optionally normalize its measure by
# the first entry, condense song files into mean/SD per recording, save the
# per-recording statistics, and (optionally) add everything to the overview.

# Scale axis of the most recently loaded song file (None until one is loaded):
scales = None
for i, species in enumerate(target_species):
    print(f'Processing {species}')
    if plot_overview:
        axes[0, i].set_title(shorten_species(species))

    # Fetch all species-specific song files:
    all_paths = search_files(species, incl='noise', ext='npz', dir=search_path)

    # Sort song files by recording (one or more per source). Recordings
    # without any file are dropped so that every column holds real data:
    sorted_paths = [rec_paths for rec_paths
                    in sort_files_by_rec(all_paths, sources)
                    if len(rec_paths) > 0]

    # Without data, saving would either fail or silently reuse the arrays of
    # the previous species under the current species name:
    if not sorted_paths:
        print(f'No song files found for {species}, skipping.')
        continue

    # Condense across song files per recording:
    rec_means, rec_sds = [], []
    for rec_paths in sorted_paths:
        file_measures = []
        for path in rec_paths:

            # Load invariance data:
            data, config = load_data(path, ['scales', 'measure_inv'])
            scales, measure = data['scales'], data['measure_inv']

            # Relate to noise (first entry). Out-of-place division leaves the
            # loaded array untouched and also works for integer input:
            if compute_ratios:
                measure = measure / measure[0]

            # Log song file data:
            file_measures.append(measure)

            if plot_overview:
                axes[0, i].plot(scales, measure, c='k', alpha=0.5)

        # Get recording statistics (one column per song file):
        file_data = np.column_stack(file_measures).astype(float)
        rec_means.append(file_data.mean(axis=1))
        rec_sds.append(file_data.std(axis=1))

        if plot_overview:
            axes[1, i].plot(scales, rec_means[-1], c='k')
            axes[1, i].fill_between(scales, rec_means[-1] - rec_sds[-1],
                                    rec_means[-1] + rec_sds[-1],
                                    color='k', alpha=0.2)

    # Combine recording statistics (one column per recording):
    rec_mean = np.column_stack(rec_means)
    rec_sd = np.column_stack(rec_sds)

    # Save condensed recording data for current species. The scales and
    # config of the last loaded song file are stored alongside:
    archive = dict(scales=scales, mean_inv=rec_mean, sd_inv=rec_sd)
    save_data(save_path + species, archive, config, overwrite=True)

    if plot_overview:
        # Species-level statistics across the recording means:
        spec_mean = rec_mean.mean(axis=1)
        spec_sd = rec_mean.std(axis=1)
        axes[2, i].plot(scales, spec_mean, c='k')
        axes[2, i].fill_between(scales, spec_mean - spec_sd, spec_mean + spec_sd,
                                color='k', alpha=0.2)

print('Done.')

if plot_overview:
    # Axes are shared, so configuring one panel applies to all of them:
    axes[0, 0].set_xscale('log')
    axes[0, 0].set_yscale('log')
    if scales is not None:
        # The x-range starts at the second scale entry.
        # NOTE(review): presumably the first entry is the noise-only
        # reference and cannot be shown on a log axis - confirm.
        axes[0, 0].set_xlim(scales[1], scales[-1])
plt.show()