paper_2025/python/condense_inv_data_thresh-lp.py

import numpy as np
import matplotlib.pyplot as plt
from thunderhopper.filetools import search_files
from thunderhopper.modeltools import load_data, save_data
from misc_functions import shorten_species, sort_files_by_rec, divide_by_zero
from IPython import embed

# GENERAL SETTINGS:
target_species = [
    'Chorthippus_biguttulus',
    'Chorthippus_mollis',
    'Chrysochraon_dispar',
    'Euchorthippus_declivus',
    'Gomphocerippus_rufus',
    'Omocestus_rufipes',
    'Pseudochorthippus_parallelus',
]
sources = [
    'BM04',
    'BM93',
    'DJN',
    'GBC',
    'FTN'
]
search_path = '../data/inv/thresh_lp/'
save_path = '../data/inv/thresh_lp/condensed/'

# ANALYSIS SETTINGS:
mode = ['pure', 'noise'][1]
normalization = [
    'none',
    'min',
    'max',
    'base',
    'range',
    ][0]
suffix = dict(
    none='_unnormed',
    min='_norm-min',
    max='_norm-max',
    base='_norm-base',
    range='_norm-range'
)[normalization]
plot_overview = False
thresh_rel = np.array([0.5, 1, 3])

# PREPARATION:
if plot_overview:
    kern_colors = ['r', 'g', 'b']
    all_figs, all_axes = {}, {}
    for thresh in thresh_rel:
        fig, axes = plt.subplots(3, len(target_species), figsize=(16, 9),
                                 sharex=True, sharey=True, layout='constrained')
        fig.suptitle(f'rel. thresh: {thresh}')
        axes[0, 0].set_ylim(0, 1)
        axes[0, 0].set_ylabel('songs')
        axes[1, 0].set_ylabel('recordings\n(mean ± SD)')
        axes[2, 0].set_ylabel('total\n(mean ± SD)')
        all_figs[thresh] = fig
        all_axes[thresh] = axes

# EXECUTION:
for i, species in enumerate(target_species):
    print(f'Processing {species}')
    if plot_overview:
        for thresh in thresh_rel:
            all_axes[thresh][0, i].set_title(shorten_species(species))

    # Fetch all species-specific song files:
    all_paths = search_files(species, incl=mode, ext='npz', dir=search_path)

    # Sort song files by recording (one or more per source):
    sorted_paths = sort_files_by_rec(all_paths, sources)

    # Condense across song files per recording:
    for j, rec_paths in enumerate(sorted_paths):
        for k, path in enumerate(rec_paths):

            # Load invariance data:
            data, config = load_data(path, ['scales', 'measure_feat'])
            scales, measure = data['scales'], data['measure_feat']

            if k == 0:
                # Prepare song file-specific storage:
                shape = measure.shape + (len(rec_paths),)
                file_data = np.zeros(shape, dtype=float)
                if j == 0:
                    # Prepare recording-specific storage:
                    shape = measure.shape + (len(sorted_paths),)
                    rec_mean = np.zeros(shape, dtype=float)
                    rec_sd = np.zeros(shape, dtype=float)

            # Log song file data:
                if normalization == 'min':
                    # Minimum normalization:
                    measure = divide_by_zero(measure, measure.min(axis=0))
                    # measure /= measure.min(axis=0, keepdims=True)
                elif normalization == 'max':
                    # Maximum normalization:
                    measure = divide_by_zero(measure, measure.max(axis=0))
                    # measure /= measure.max(axis=0, keepdims=True)
                elif normalization == 'base':
                    # Noise baseline normalization:
                    measure = divide_by_zero(measure, measure[0])
                    # measure /= measure[0]
                elif normalization == 'range':
                    # Min-max normalization:
                    min_measure = measure.min(axis=0, keepdims=True)
                    max_measure = measure.max(axis=0, keepdims=True)
                    measure = divide_by_zero(measure - min_measure, max_measure - min_measure)
                    # measure = (measure - min_measure) / (max_measure - min_measure)

            file_data[..., k] = measure

            if plot_overview:
                for l, thresh in enumerate(thresh_rel):
                    axes = all_axes[thresh]
                    for m, c in enumerate(kern_colors):
                        axes[0, i].plot(scales, measure[:, m, l], c=c, alpha=0.5)

        # Get recording statistics:
        rec_mean[..., j] = np.nanmean(file_data, axis=-1)
        rec_sd[..., j] = np.nanstd(file_data, axis=-1)

        if plot_overview:
            for l, thresh in enumerate(thresh_rel):
                axes = all_axes[thresh]
                for m, c in enumerate(kern_colors):
                    axes[1, i].plot(scales, rec_mean[:, m, l, j], c=c)
                    spread = (rec_mean[:, m, l, j] - rec_sd[:, m, l, j],
                              rec_mean[:, m, l, j] + rec_sd[:, m, l, j])
                    axes[1, i].fill_between(scales, *spread, color=c, alpha=0.2)

    # Save condensed recording data:
    save_name = save_path + species + '_' + mode + suffix
    archive = dict(
        scales=scales,
        mean_feat=rec_mean,
        sd_feat=rec_sd,
        thresh_rel=thresh_rel,)
    save_data(save_name, archive, config)

    if plot_overview:
        spec_mean = rec_mean.mean(axis=-1)
        spec_sd = rec_mean.std(axis=-1)
        for l, thresh in enumerate(thresh_rel):
            axes = all_axes[thresh]
            for m, c in enumerate(kern_colors):
                axes[2, i].plot(scales, spec_mean[:, m, l], c=c)
                spread = (spec_mean[:, m, l] - spec_sd[:, m, l],
                          spec_mean[:, m, l] + spec_sd[:, m, l])
                axes[2, i].fill_between(scales, *spread, color=c, alpha=0.2)

print('Done.')

if plot_overview:
    for thresh in thresh_rel:
        axes = all_axes[thresh]
        axes[0, 0].set_xscale('log')
        axes[0, 0].set_xlim(scales[1], scales[-1])
plt.show()