"""Condense per-song-file invariance measures into per-recording statistics.

For each target species, all matching .npz song files are grouped by
recording (via the global time stamp embedded in the file name behind the
source tag), per-stage measures are stacked per recording, and the
recording-wise nan-mean/nan-sd are written to a single condensed archive
per species.
"""
import numpy as np

from thunderhopper.filetools import search_files, crop_paths
from thunderhopper.modeltools import load_data, save_data
from IPython import embed  # kept for interactive debugging sessions


def sort_files_by_rec(paths, sources=('JJ', 'SLO')):
    """Group song file paths by recording.

    Parameters
    ----------
    paths : list of str
        Candidate song file paths.
    sources : sequence of str
        Source tags to look for in each path. A path is assigned to the
        first... NOTE(review): a path containing several tags is assigned
        to EVERY matching source — presumably tags are mutually exclusive
        in practice; confirm against the data layout.
        (Tuple default avoids the mutable-default-argument pitfall.)

    Returns
    -------
    list of list of str
        One inner list per recording, each holding that recording's paths.
        Source separation is discarded in the flat result.
    """
    # Separate by source:
    sorted_paths = {}
    for source in sources:
        # Check for any source-specific song files:
        source_paths = [path for path in paths if source in path]
        if not source_paths:
            continue
        # Separate by recording:
        sorted_paths[source] = {}
        for path, name in zip(source_paths, crop_paths(source_paths)):
            # Find global time stamp behind source tag (tag + separator):
            ind = name.find(source) + len(source) + 1
            # Last underscore-separated token carries the time stamps;
            # first two dash-separated fields identify the recording.
            time_stamps = name[ind:].split('_')[-1]
            global_time = '-'.join(time_stamps.split('-')[:2])
            if global_time in sorted_paths[source]:
                # Found existing time stamp (known recording):
                sorted_paths[source][global_time].append(path)
            else:
                # Found new time stamp (novel recording):
                sorted_paths[source][global_time] = [path]
    # Re-sort song files by recording only (discarding source separation):
    flat_sorted = []
    for source_paths in sorted_paths.values():
        for rec_paths in source_paths.values():
            flat_sorted.append(rec_paths)
    return flat_sorted


# GENERAL SETTINGS:
target_species = ['Pseudochorthippus_parallelus']
mode = ['song', 'noise'][0]
# Processing stages for which invariance measures exist in each file:
stages = ['raw', 'filt', 'env', 'log', 'inv', 'conv', 'feat']
search_path = f'../data/inv/field/{mode}/'
save_path = f'../data/inv/field/{mode}/condensed/'
sources = [
    'JJ',
    'SLO',
]

# ANALYSIS SETTINGS:
normalization = 'none'
if mode == 'song':
    # Pick the last entry of the (partially commented-out) option list:
    normalization = [
        'none',
        # 'base',
        'range'
    ][-1]

# EXECUTION:
for species in target_species:
    print(f'Processing {species}')
    # Fetch all species-specific song files:
    all_paths = search_files(species, ext='npz', dir=search_path)
    if not all_paths:
        continue
    # Sort song files by recording (one or more per source):
    sorted_paths = sort_files_by_rec(all_paths, sources)
    # Condense across song files per recording:
    for j, rec_paths in enumerate(sorted_paths):
        for k, path in enumerate(rec_paths):
            # Load invariance data:
            data, config = load_data(path, 'distances', 'measure')
            if k == 0:
                # Prepare song file-specific storage (one slot per file
                # of this recording, stacked along a new trailing axis):
                file_data = {}
                for stage in stages:
                    shape = data[f'measure_{stage}'].shape + (len(rec_paths),)
                    file_data[stage] = np.zeros(shape, dtype=float)
                if j == 0:
                    # Prepare recording-specific storage (one slot per
                    # recording along a new trailing axis). Assumes all
                    # files share the same per-stage measure shape —
                    # TODO confirm.
                    rec_mean, rec_sd = {}, {}
                    for stage in stages:
                        shape = data[f'measure_{stage}'].shape + (len(sorted_paths),)
                        rec_mean[f'mean_{stage}'] = np.zeros(shape, dtype=float)
                        rec_sd[f'sd_{stage}'] = np.zeros(shape, dtype=float)
            # Log song file data:
            for stage in stages:
                mkey = f'measure_{stage}'
                if normalization == 'range':
                    # Min-max normalization along the first axis.
                    # NOTE(review): a constant column (max == min) divides
                    # by zero and propagates NaN/inf into the statistics —
                    # presumably acceptable, since nanmean/nanstd are used
                    # below; confirm.
                    min_measure = data[mkey].min(axis=0, keepdims=True)
                    max_measure = data[mkey].max(axis=0, keepdims=True)
                    data[mkey] = (data[mkey] - min_measure) / (max_measure - min_measure)
                file_data[stage][..., k] = data[mkey]
        # Get recording statistics (across this recording's files,
        # ignoring NaNs):
        for stage in stages:
            rec_mean[f'mean_{stage}'][..., j] = np.nanmean(file_data[stage], axis=-1)
            rec_sd[f'sd_{stage}'][..., j] = np.nanstd(file_data[stage], axis=-1)
    # Save condensed recording data (one archive per species, tagged by
    # normalization mode):
    save_name = save_path + species
    if normalization == 'none':
        save_name += '_unnormed'
    elif normalization == 'base':
        save_name += '_norm-base'
    elif normalization == 'range':
        save_name += '_norm-range'
    # 'distances' axis is taken from the last loaded file — assumes it is
    # identical across files; TODO confirm.
    archive = dict(distances=data['distances'])
    archive.update(rec_mean)
    archive.update(rec_sd)
    save_data(save_name, archive, config, overwrite=True)

print('Done.')