\documentclass[a4paper, 12pt]{article} \usepackage[left=2cm,right=2cm,top=2cm,bottom=2cm,includeheadfoot]{geometry} \usepackage[onehalfspacing]{setspace} \usepackage{graphicx} \usepackage{svg} \usepackage{import} \usepackage{float} \usepackage{placeins} \usepackage{parskip} \usepackage{amsmath} \usepackage{amssymb} \usepackage{subcaption} \usepackage[labelfont=bf, textfont=small]{caption} \usepackage[german,english]{babel} \addto\captionsenglish{\renewcommand{\figurename}{Fig.}} \addto\captionsenglish{\renewcommand{\tablename}{Tab.}} \usepackage[separate-uncertainty=true, locale=DE]{siunitx} \sisetup{output-exponent-marker=\ensuremath{\mathrm{e}}} % \usepackage[capitalize]{cleveref} % \crefname{figure}{Fig.}{Figs.} % \crefname{equation}{Eq.}{Eqs.} % \creflabelformat{equation}{#2#1#3} \usepackage[ backend=biber, style=authoryear, pluralothers=true, maxcitenames=1, mincitenames=1 ]{biblatex} \addbibresource{cite.bib} %\bibdata %\bibstyle %\citation \title{Emergent intensity invariance vs. 
signal-to-noise ratio at three consecutive processing stages along the grasshopper song recognition pathway} \author{Jona Hartling, Jan Benda} \date{} \begin{document} \maketitle{} % Text references and citations: \newcommand{\bcite}[1]{\mbox{\cite{#1}}} % \newcommand{\fref}[1]{\mbox{\cref{#1}}} % \newcommand{\fref}[1]{\mbox{Fig.\,\ref{#1}}} % \newcommand{\eref}[1]{\mbox{\cref{#1}}} % \newcommand{\eref}[1]{\mbox{Eq.\,\ref{#1}}} % Subplot lettering: \newcommand{\figa}{\textbf{a}} \newcommand{\figb}{\textbf{b}} \newcommand{\figc}{\textbf{c}} \newcommand{\figd}{\textbf{d}} \newcommand{\fige}{\textbf{e}} % Math shorthands - Standard symbols: \newcommand{\dec}{\log_{10}} % Logarithm base 10 \newcommand{\infint}{\int_{-\infty}^{+\infty}} % Indefinite integral % Math shorthands - Spectral filtering: \newcommand{\bp}{h_{\text{BP}}(t)} % Bandpass filter function \newcommand{\lp}{h_{\text{LP}}(t)} % Lowpass filter function \newcommand{\hp}{h_{\text{HP}}(t)} % Highpass filter function \newcommand{\fc}{f_{\text{cut}}} % Filter cutoff frequency \newcommand{\tlp}{T_{\text{LP}}} % Lowpass filter averaging interval \newcommand{\thp}{T_{\text{HP}}} % Highpass filter adaptation interval % Math shorthands - Early representations: \newcommand{\raw}{x_{\text{raw}}} % Placeholder input signal \newcommand{\filt}{x_{\text{filt}}} % Bandpass filtered signal \newcommand{\env}{x_{\text{env}}} % Signal envelope \newcommand{\db}{x_{\text{log}}} % Logarithmically scaled signal \newcommand{\dbref}{x_{\text{ref}}} % Decibel reference intensity \newcommand{\adapt}{x_{\text{adapt}}} % Adapted signal % Math shorthands - Kernel parameters: \newcommand{\kw}{\sigma} % Unspecific Gabor kernel width \newcommand{\kf}{\omega} % Unspecific Gabor kernel frequency \newcommand{\kp}{\phi} % Unspecific Gabor kernel phase \newcommand{\kn}{n} % Unspecific Gabor kernel lobe number % \newcommand{\ks}{s} % Unspecific Gabor kernel sign \newcommand{\kwi}{\kw_i} % Specific Gabor kernel width \newcommand{\kfi}{\kf_i} % 
Specific Gabor kernel frequency \newcommand{\kpi}{\kp_i} % Specific Gabor kernel phase \newcommand{\kni}{\kn_i} % Specific Gabor kernel lobe number % \newcommand{\ksi}{\ks_i} % Specific Gabor kernel sign % Math shorthands - Auxiliary kernel parameters: \newcommand{\fsin}{f_{\text{sin}}} % Carrier frequency \newcommand{\rh}{h_{\text{rel}}} % Relative Gaussian height for FWRH \newcommand{\fwrh}{\text{FWRH}} % Gaussian full-width at relative height \newcommand{\off}{\beta_0} % Offset for linear frequency approximation % Math shorthands - Thresholding nonlinearity: \newcommand{\thr}{\Theta_i} % Step function threshold value \newcommand{\nl}{H(c_i\,-\,\thr)} % Shifted Heaviside step function % Math shorthands - Intensity invariance analysis: \newcommand{\soc}{s} % Song component of synthetic mixture \newcommand{\noc}{\eta} % Noise component of synthetic mixture \newcommand{\sca}{\alpha} % Multiplicative scale of song component \newcommand{\xvar}{\sigma_{x}^{2}} % Variance of synthetic mixture \newcommand{\svar}{\sigma_{\text{s}}^{2}} % Song component variance \newcommand{\nvar}{\sigma_{\eta}^{2}} % Noise component variance \newcommand{\pc}{p(c,\,T)} % Probability density (general interval) \newcommand{\pclp}{p(c,\,\tlp)} % Probability density (lowpass interval) \section{Exploring a grasshopper's sensory world} % Why functional models of sensory systems? Our scientific understanding of sensory processing systems results from the distributed accumulation of anatomical, physiological and ethological evidence. This process is undoubtedly without alternative; however, it leaves us with the challenge of integrating the available fragments into a coherent whole in order to address issues such as the interaction between individual system components, the functional limitations of the system overall, or taxonomic comparisons between systems that process the same sensory modality. 
Any unified framework that captures the essential functional aspects of a given sensory system thus has the potential to deepen our current understanding and facilitate systematic investigations. However, building such a framework is a challenging task. It requires a wealth of existing knowledge of the system and the signals it operates on, a clearly defined scope, and careful reduction, abstraction, and formalization of the underlying structures and mechanisms. % Why the grasshopper auditory system? % Why focus on song recognition among other auditory functions? One sensory system about which extensive information has been gathered over the years is the auditory system of grasshoppers~(\textit{Acrididae}). Grasshoppers rely on their sense of hearing primarily for intraspecific communication, which includes mate attraction~(\bcite{helversen1972gesang}) and evaluation~(\bcite{stange2012grasshopper}), sender localization~(\bcite{helversen1988interaural}), courtship display~(\bcite{elsner1968neuromuskularen}), rival deterrence~(\bcite{greenfield1993acoustic}), and loss-of-signal predator alarm~(SOURCE). In accordance with this rich behavioral repertoire, grasshoppers have evolved a variety of sound production mechanisms to generate acoustic communication signals for different contexts and ranges using their wings, hindlegs, or mandibles~(\bcite{otte1970comparative}). Among the most conspicuous acoustic signals of grasshoppers are their species-specific calling songs, which broadcast the presence of the singing individual --- mostly the males of the species --- to potential mates within range. These songs are usually more characteristic of a species than morphological traits~(\bcite{tishechkin2016acoustic}; \bcite{tarasova2021eurasius}), which can vary greatly within species~(\bcite{rowell1972variable}; \bcite{kohler2017morphological}). 
The reliance on songs to mediate reproduction represents a strong evolutionary driving force that resulted in a massive species diversification~(\bcite{vedenina2011speciation}; \bcite{sevastianov2023evolution}), with over 6800 recognized grasshopper species in the \textit{Acrididae} family~(\bcite{cigliano2024orthoptera}). It is this diversity of species, and the crucial role of acoustic communication in its emergence, that makes the grasshopper auditory system an intriguing candidate for attempting to construct a functional model framework. As a necessary reduction, the model we propose here focuses on the pathway responsible for the recognition of species-specific calling songs, disregarding other essential auditory functions such as directional hearing~(\bcite{helversen1984parallel}; \bcite{ronacher1986routes}; \bcite{helversen1988interaural}). % What are the signals the auditory system is supposed to recognize? % Why is intensity invariance important for song recognition? % (Obviously, split this paragraph) To understand the functional challenges faced by the grasshopper auditory system, one has to understand the properties of the songs it is designed to recognize. Grasshopper songs are amplitude-modulated broad-band acoustic signals. Most songs are produced by stridulation, during which the animal pulls the serrated stridulatory file on its hindlegs across a resonating vein on the forewings~(\bcite{helversen1977stridulatory}; \bcite{stumpner1994song}; \bcite{helversen1997recognition}). Every tooth that strikes the vein generates a brief pulse of sound. Multiple pulses make up a syllable; and the alternation of syllables and relatively quiet pauses forms a characteristic, though noisy, waveform pattern. 
Song recognition depends on certain temporal and structural parameters of this pattern, such as the duration of syllables and pauses~(\bcite{helversen1972gesang}), the slope of pulse onsets~(\bcite{helversen1993absolute}), and the accentuation of syllable onsets relative to the preceding pause~(\bcite{balakrishnan2001song}; \bcite{helversen2004acoustic}). The amplitude modulation of the song is sufficient for recognition~(\bcite{helversen1997recognition}). However, the essential recognition cues can vary considerably with external physical factors, which requires the auditory system to be invariant to such variations in order to reliably recognize songs under different conditions. For instance, the temporal structure of grasshopper songs warps with temperature~(\bcite{skovmand1983song}). The auditory system can compensate for this variability by reading out relative temporal relationships rather than absolute time intervals~(\bcite{creutzig2009timescale}; \bcite{creutzig2010timescale}), as those remain relatively constant across different temperatures~(\bcite{helversen1972gesang}). Another, perhaps even more fundamental external source of song variability lies in the attenuation of sound intensity with increasing distance to the sender. Sound attenuation depends on both the frequency content of the signal and the vegetation of the habitat~(\bcite{michelsen1978sound}). For the receiving auditory system, this has two major implications. First, the amplitude dynamics of the song pattern are steadily degraded over distance, which limits the effective communication range of grasshoppers to~\mbox{1--2\,m} in their typical grassland habitats~(\bcite{lang2000acoustic}). Second, the overall intensity level of songs at the receiver's position varies depending on the location of the sender, which should ideally not affect the recognition of the song pattern. 
This necessitates that the auditory system achieves a certain degree of intensity invariance --- a time scale-selective sensitivity to faster amplitude dynamics and simultaneous insensitivity to slower, more sustained amplitude dynamics. Intensity invariance in different auditory systems is often associated with neuronal adaptation~(\bcite{benda2008spike}; \bcite{barbour2011intensity}; \bcite{ozeri2018fast}; more general:~\bcite{benda2021neural}). In the grasshopper auditory system, a number of neuron types along the processing chain exhibit spike-frequency adaptation in response to sustained stimulus intensities~(\bcite{romer1976informationsverarbeitung}; \bcite{gollisch2004input}; \bcite{hildebrandt2009origin}; \bcite{clemens2010intensity}; \bcite{fisch2012channel}) and thus likely contribute to the emergence of intensity-invariant song representations. This means that intensity invariance is not the result of a single processing step but rather a gradual process, in which different neuronal populations contribute to varying degrees~(\bcite{clemens2010intensity}) and by different mechanisms~(\bcite{hildebrandt2009origin}). Approximating this process within a functional model framework thus requires a considerable amount of simplification. In this work, we demonstrate that even a small number of basic physiologically inspired signal transformations --- specifically, pairs of nonlinear and linear operations --- is sufficient to achieve a meaningful degree of intensity invariance. % How can song recognition be modelled functionally (feat. Jan Clemens & Co.)? % How did we expand on the previous framework? % (Still can't stand some of this paragraph's structure and wording...) Invariance to non-informative song variations is crucial for reliable song recognition; however, it is not sufficient to this end. 
In order to recognize a conspecific song as such, the auditory system needs to extract sufficiently informative features of the song pattern and then integrate the gathered information into a final categorical percept. Previous authors have proposed a functional model framework that describes this process --- feature extraction, evidence accumulation, and categorical decision making --- in both crickets~(\bcite{clemens2013computational}; \bcite{hennig2014time}) and grasshoppers~(\bcite{clemens2013feature}; review on both:~\bcite{ronacher2015computational}). Their framework provides a comprehensible and biologically plausible account of the computational mechanisms required for species-specific song recognition, which has served as the inspiration for the development of the model pathway we propose here. The existing framework relies on pulse trains as input signals, which were designed to capture the essential structural properties of natural song envelopes~(\bcite{clemens2013feature}). In the first step, a bank of parallel linear-nonlinear feature detectors is applied to the input signal. Each feature detector consists of a convolutional filter and a subsequent sigmoidal nonlinearity. The outputs of these feature detectors are temporally averaged to obtain a single feature value per detector, which is then assigned a specific weight. The linear combination of weighted feature values results in a single preference value that serves as a predictor for the behavioral response of the animal to the presented input signal. Our model pathway adopts the general structure of the existing framework but modifies it in several key aspects. The convolutional filters, which have previously been fitted to behavioral data for each individual species~(\bcite{clemens2013computational}), are replaced by a larger, generic set of unfitted Gabor basis functions in order to cover a wide range of possible song features across different species. 
Gabor functions approximate the general structure of the filters used in the existing framework as well as the filter functions found in various auditory neurons~(\bcite{rokem2006spike}; \bcite{clemens2011efficient}; \bcite{clemens2012nonlinear}). The fitted sigmoidal nonlinearities in the existing framework consistently exhibited very steep slopes and are therefore replaced by shifted Heaviside step-functions, which results in a binarization of the feature detector outputs. Another, more substantial modification is that the feature detector outputs are temporally averaged in a way that does not condense them into single feature values but retains their time-varying structure. This is in line with the fact that songs are not discrete units but part of a continuous acoustic stream that the auditory system has to process in real time. Moreover, a time-varying feature representation only stabilizes after a certain delay following the onset of a song, which emphasizes the temporal dynamics of evidence accumulation towards a final categorical decision. The most notable difference between our model pathway and the existing framework, however, lies in the addition of a physiologically inspired preprocessing stage, whose starting point corresponds to the initial reception of airborne sound waves. This allows the model to operate on unmodified recordings of natural grasshopper songs instead of condensed pulse train approximations, which widens its scope towards more realistic, ecologically relevant scenarios. For instance, we were able to investigate the contribution of different processing stages to the emergence of intensity-invariant song representations based on actual field recordings of songs at different distances from the sender. % Forgive me, it's friday. 
In the following, we outline the structure of the proposed model of the grasshopper auditory pathway, from the initial reception of sound waves up to the generation of a high-dimensional, time-varying feature representation that is suitable for species-specific song recognition. We provide a side-by-side account of the known physiological processing steps and their functional approximation by basic mathematical operations. We then elaborate on two key mechanisms that drive the emergence of intensity-invariant song representations within the auditory pathway. % SCRAPPED UNTIL FURTHER NOTICE: % Multi-species, multi-individual communally inhabited environments\\ % - Temporal overlap: Simultaneous singing across individuals/species common\\ % - Frequency overlap: Little speciation into frequency bands (likely unused)\\ % - "Biotic noise": Hetero-/conspecifics ("Another one's songs are my noise")\\ % - "Abiotic noise": Wind, water, vegetation, anthropogenic\\ % - Effects of habitat structure on sound propagation (landscape - soundscape)\\ % $\rightarrow$ Sensory constraints imposed by the (acoustic) environment % Cluster of auditory challenges (interlocking constraints $\rightarrow$ tight coupling):\\ % From continuous acoustic input, generate neuronal representations that...\\ % 1)...allow for the separation of relevant (song) events from ambient noise floor\\ % 2)...compensate for behaviorally non-informative song variability (invariances)\\ % 3)...carry sufficient information to characterize different song patterns, % recognize the ones produced by conspecifics, and make appropriate behavioral % decisions based on context (sender identity, song type, mate/rival quality) % How can the auditory system of grasshoppers meet these challenges?\\ % - What are the minimum functional processing steps required?\\ % - Which known neuronal mechanisms can implement these steps?\\ % - Which and how many stages along the auditory pathway contribute?\\ % $\rightarrow$ What are the 
limitations of the system as a whole? % How can a human observer conceive a grasshopper's auditory percepts?\\ % - How to investigate the workings of the auditory pathway as a whole?\\ % - How to systematically test effects and interactions of processing parameters?\\ % - How to integrate the available knowledge on anatomy, physiology, ethology?\\ % $\rightarrow$ Abstract, simplify, formalize $\rightarrow$ Functional model framework \section{Developing a functional model of the\\grasshopper song recognition pathway} % Too long (no splitting, only pruning). The essence of constructing a functional model of a given system is to gain a sufficient understanding of the system's essential structural components and their presumed functional roles; and to then build a formal framework of manageable complexity around these two aspects. Anatomically, the organization of the grasshopper song recognition pathway can be outlined as a feed-forward network of three consecutive neuronal populations~(Fig.\,\mbox{\ref{fig:pathway}a-c}): Peripheral auditory receptor neurons, whose axons enter the ventral nerve cord at the level of the metathoracic ganglion; local interneurons that remain exclusively within the thoracic region of the ventral nerve cord; and ascending neurons projecting from the thoracic region towards the supraesophageal ganglion~(\bcite{rehbein1974structure}; \bcite{rehbein1976auditory}; \bcite{eichendorf1980projections}). The input to the network originates at the tympanal membrane, which acts as acoustic receiver and is coupled to the dendritic endings of the receptor neurons~(\bcite{gray1960fine}). The outputs from the network converge in the supraesophageal ganglion, which is presumed to harbor the neuronal substrate for conspecific song recognition and response initiation~(\bcite{ronacher1986routes}; \bcite{bauer1987separate}; \bcite{bhavsar2017brain}). Functionally, the ascending neurons are the most diverse of the three populations along the pathway. 
Individual ascending neurons possess highly specific response properties that contrast with the rather homogeneous response properties of the preceding receptor neurons and local interneurons~(\bcite{clemens2011efficient}), indicating a transition from a uniform population-wide processing stream into several parallel branches. Based on these anatomical and physiological considerations, the overall structure of the model pathway is divided into two distinct stages~(Fig.\,\ref{fig:pathway}d). The preprocessing stage incorporates the known physiological processing steps at the levels of the tympanal membrane, the receptor neurons, and the local interneurons; and operates on one-dimensional signal representations. The feature extraction stage corresponds to the processing within the ascending neurons and further downstream towards the supraesophageal ganglion; and operates on high-dimensional signal representations. The details of each physiological processing step and its functional approximation within the two stages are outlined in the following sections. \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_auditory_pathway.pdf} \caption{\textbf{Schematic organization of the grasshopper song recognition pathway and structure of the functional model pathway.} \textbf{a}:~Simplified course of the pathway in the grasshopper, from the tympanal membrane over receptor neurons, local interneurons, and ascending neurons further towards the supraesophageal ganglion. \textbf{b}:~Schematic of synaptic connections between the three neuronal populations within the metathoracic ganglion. \textbf{c}:~Network representation of neuronal connectivity. \textbf{d}:~Flow diagram of consecutive signal representations~(boxes) and transformations~(arrows) along the model pathway. All representations are time-varying. 1st half: Preprocessing stage~(one-dimensional representation). 2nd half: Feature extraction stage~(high-dimensional representation). 
} \label{fig:pathway} \end{figure} \subsection{Population-driven signal preprocessing} Grasshoppers receive airborne sound waves by a tympanal organ at either side of the body. The tympanal membrane acts as a mechanical resonance filter for sound-induced vibrations~(\bcite{windmill2008time}; \bcite{malkin2014energy}). Vibrations that fall within specific frequency bands are focused on different membrane areas, while others are attenuated. This processing step can be approximated by an initial bandpass filter \begin{equation} \filt(t)\,=\,\raw(t)\,*\,\bp, \qquad \fc\,=\,5\,\text{kHz},\,30\,\text{kHz} \label{eq:bandpass} \end{equation} applied to the acoustic input signal $\raw(t)$. The auditory receptor neurons transduce the vibrations of the tympanal membrane into sequences of action potentials. Thereby, they encode the amplitude modulation, or envelope, of the signal~(\bcite{machens2001discrimination}), which likely involves a rectifying nonlinearity~(\bcite{machens2001representation}). This can be modelled as full-wave rectification followed by lowpass filtering \begin{equation} \env(t)\,=\,|\filt(t)|\,*\,\lp, \qquad \fc\,=\,250\,\text{Hz} \label{eq:env} \end{equation} of the tympanal signal $\filt(t)$. Furthermore, the receptors exhibit a sigmoidal response curve over logarithmically compressed intensity levels~(\bcite{suga1960peripheral}; \bcite{gollisch2002energy}). In the model pathway, logarithmic compression is achieved by conversion to decibel scale \begin{equation} \db(t)\,=\,20\,\cdot\,\dec \frac{\env(t)}{\dbref}, \qquad \dbref\,=\,1 \label{eq:log} \end{equation} relative to the common reference intensity $\dbref$. 
Both the receptor neurons~(\bcite{romer1976informationsverarbeitung}; \bcite{gollisch2004input}; \bcite{fisch2012channel}) and, on a larger scale, the subsequent local interneurons~(\bcite{hildebrandt2009origin}; \bcite{clemens2010intensity}) adapt their firing rates in response to sustained stimulus intensity levels, which allows for the robust encoding of faster amplitude modulations against a slowly changing overall baseline intensity. Functionally, the adaptation mechanism resembles a highpass filter \begin{equation} \adapt(t)\,=\,\db(t)\,*\,\hp, \qquad \fc\,=\,10\,\text{Hz} \label{eq:highpass} \end{equation} over the logarithmically scaled envelope $\db(t)$. This processing step concludes the preprocessing stage of the model pathway. The resulting intensity-adapted envelope $\adapt(t)$ is then passed on from the local interneurons to the ascending neurons, where it serves as the basis for the following feature extraction stage. % Cite somewhere: \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_pre_stages.pdf} \caption{\textbf{Representations of a song of \textit{O. rufipes} during the preprocessing stage.} \textbf{a}:~Bandpass filtered tympanal signal $\filt(t)$. \textbf{b}:~Signal envelope $\env(t)$. \textbf{c}:~Logarithmically compressed envelope $\db(t)$. \textbf{d}:~Intensity-adapted envelope $\adapt(t)$. } \label{fig:stages_pre} \end{figure} \FloatBarrier \subsection{Feature extraction by individual neurons} The ascending neurons extract and encode a number of different features of the preprocessed signal. As a population, they hence represent the signal in a higher-dimensional space than the preceding receptor neurons and local interneurons. Each ascending neuron is assumed to scan the signal for a specific template pattern, which can be thought of as a kernel of a particular structure and on a particular time scale. 
This process, known as template matching, can be modelled as a convolution \begin{equation} c_i(t)\,=\,\adapt(t)\,*\,k_i(t) = \infint \adapt(\tau)\,\cdot\,k_i(t\,-\,\tau)\,d\tau \label{eq:conv} \end{equation} of the intensity-adapted envelope $\adapt(t)$ with a kernel $k_i(t)$ per ascending neuron. We use Gabor kernels as basis functions for creating different template patterns. An arbitrary one-dimensional, real Gabor kernel is generated by multiplication of a Gaussian envelope and a sinusoidal carrier \begin{equation} k_i(t,\,\kwi,\,\kfi,\,\kpi)\,=\,e^{-\frac{t^{2}}{2{\kwi}^{2}}}\,\cdot\,\sin(\kfi\,t\,+\,\kpi), \qquad \kfi\,=\,2\pi\fsin \label{eq:gabor} \end{equation} with Gaussian standard deviation or kernel width $\kwi$, carrier frequency $\kfi$, and carrier phase $\kpi$. Different combinations of $\kw$ and $\kf$ result in Gabor kernels with different lobe number $\kn$, which is the number of half-periods of the carrier that fit under the Gaussian envelope within reasonable limits of attenuation. 
The interval under the Gaussian envelope that contains the relevant lobes of the kernel can be defined as Gaussian full-width measured at relative peak height $\rh$ \begin{equation} \fwrh(\kw,\,\rh)\,=\,2\,\cdot\,\sqrt{-2\,\cdot\,\ln \rh}\cdot\,\kw, \qquad \rh\,\in\,(0,\,1]. \end{equation} With this, an appropriate carrier frequency $\kf$ for obtaining a Gabor kernel with width $\kw$ and desired lobe number $\kn$ can be approximated as % \begin{equation} % \kf(\kn,\,\fwrh)\,=\,\frac{0.5\,\cdot\,\kn\,+\,\off}{\fwrh}, \qquad \kn\,\geq\,2\enspace\forall\enspace \kn\,\in\,\mathbb{Z} % \end{equation} \begin{equation} \kf(\kn,\,\kw,\,\rh)\,=\,\frac{\kn\,+\,\off}{4\,\cdot\,\sqrt{-2\,\cdot\,\ln \rh}\,\cdot\,\kw}, \qquad \kn\,\geq\,2\enspace\forall\enspace \kn\,\in\,\mathbb{Z} \end{equation} where $\off$ is a small positive offset to the near-linear relationship between $\kf$ and $\kn$ to balance the amplitude of the $\kn$ desired lobes of the kernel --- which should be maximized --- against the amplitude of the next-outer lobes, which should not exceed the threshold value determined by $\rh$. For $\kn=1$, carrier frequency $\kf$ is set to zero, which results in a simple Gaussian kernel. Carrier phase $\kp$ determines the position of the kernel lobes relative to the kernel center. By setting $\kp$ to one of only four specific phase values~(Tab.\,\ref{tab:gabor_phases}), we restrict the Gabor kernels to be either even functions~(mirror-symmetric, odd $\kn$) or odd functions~(point-symmetric, even $\kn$) with either positive or negative sign, which refers to the sign of the kernel's central lobe (even kernels) or the left of the two central lobes (odd kernels). 
\FloatBarrier \begin{table}[!ht] \centering \captionsetup{width=.46\textwidth} \caption{Values of phase $\kp$ that are specific for the four major groups of Gabor kernels.} \begin{tabular}{|ccc|} \hline sign & even kernels & odd kernels\\ \hline $+$ & $+\pi\,/\,2$ & $\pi$\\ $-$ & $-\pi\,/\,2$ & $0$\\ \hline \end{tabular} \label{tab:gabor_phases} \end{table} \FloatBarrier These four major groups of Gabor kernels allow for the extraction of different types of signal features, such as the presence of peaks (even, $+$), troughs (even, $-$), onsets (odd, $+$), and offsets (odd, $-$) at various time scales. % Add kernel normalization here. Following the convolutional template matching, each kernel-specific response $c_i(t)$ is passed through a shifted Heaviside step-function $\nl$ with threshold value $\thr$ to obtain a binary response \begin{equation} b_i(t,\,\thr)\,=\,\begin{cases} \;1, \quad c_i(t)\,>\,\thr\\ \;0, \quad c_i(t)\,\leq\,\thr \end{cases} \label{eq:binary} \end{equation} which can be thought of as a categorization into ``relevant'' and ``irrelevant'' response values. In the grasshopper, these thresholding nonlinearities might either be part of the processing within the ascending neurons or take place further downstream~(SOURCE). Finally, the responses of the ascending neurons are assumed to be integrated somewhere in the supraesophageal ganglion~(\bcite{ronacher1986routes}; \bcite{bauer1987separate}; \bcite{bhavsar2017brain}). This processing step can be approximated as temporal averaging of the binary responses $b_i(t)$ by a lowpass filter \begin{equation} f_i(t)\,=\,b_i(t)\,*\,\lp, \qquad \fc\,=\,1\,\text{Hz} \label{eq:lowpass} \end{equation} to obtain a final set of slowly changing kernel-specific features $f_i(t)$. In the resulting high-dimensional feature space, different species-specific song patterns are characterized by a distinct combination of feature values, which can be read out by a simple linear classifier. 
% Cite somewhere: \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_feat_stages.pdf} \caption{\textbf{Representations of a song of \textit{O. rufipes} during the feature extraction stage.} Different color shades indicate different types of Gabor kernels with specific lobe number $\kn$ and either $+$ or $-$ sign, sorted (dark to light) first by increasing $\kn$ and then by sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for each $\kn$; two kernel widths $\kw$ of $4\,$ms and $32\,$ms per type; 8 types, 16 kernels in total). \textbf{a}:~Kernel-specific filter responses $c_i(t)$. \textbf{b}:~Binary responses $b_i(t)$. \textbf{c}:~Finalized features $f_i(t)$.} \label{fig:stages_feat} \end{figure} \FloatBarrier \section{Mechanisms driving the emergence of\\intensity-invariant song representation} % Still missing the SNR analysis. Should be able to write around it for now. The robustness of song recognition is tied to the degree of intensity invariance of the finalized feature representation. Ideally, the values of each feature should depend only on the relative amplitude dynamics of the song pattern but not on the overall intensity of the song. In the grasshopper, the emergence of intensity-invariant representations along the song recognition pathway likely is a distributed process that involves different neuronal populations, which raises the question of what the essential computational mechanisms are that drive this process. Within the model pathway, we identified two key mechanisms that render the song representation more invariant to intensity variations. The two mechanisms each comprise a nonlinear signal transformation followed by a linear signal transformation but differ in the specific operations involved, as outlined in the following sections. 
\subsection{Full-wave rectification \& lowpass filtering} The first nonlinear transformation along the model pathway is the full-wave rectification of the tympanal signal $\filt(t)$ during the extraction of the signal envelope (Eq.\,\ref{eq:env}). Rectification transforms the distribution of $\filt(t)$ from an approximately zero-centered distribution with both positive and negative values into a strictly non-negative distribution. Signal envelope $\env(t)$ is then obtained by lowpass filtering the rectified $\filt(t)$. The effects of this transformation pair on SNR and potential intensity invariance were analyzed by rescaling and processing the input signal $\raw(t)$ and comparing standard deviations between the resulting $\filt(t)$ and $\env(t)$, once for the noiseless case~(Fig.\,\ref{fig:rect-lp}a) and once for the noisy case~(Fig.\,\ref{fig:rect-lp}b). In addition, the cutoff frequency $\fc$ of the lowpass filter was varied to investigate the influence of different filter bandwidths. In the noiseless case, the standard deviations of $\filt(t)$ and $\env(t)$ are each reduced compared to the input $\raw(t)$ by a multiplicative factor. These factors are constant across all $\sca$, which results in a downward shift of the respective curve on a double-logarithmic scale, away from the diagonal~(Fig.\,\ref{fig:rect-lp}c). For $\filt(t)$, the reduction is a consequence of the bandpass filtering~(Eq.\,\ref{eq:bandpass}) of $\raw(t)$. For $\env(t)$, the standard deviation is further reduced compared to $\filt(t)$. Rectification contributes much less to this reduction than lowpass filtering. The degree of reduction by lowpass filtering depends on the cutoff frequency $\fc$, with lower $\fc$ (narrow bandwidth) resulting in a stronger reduction. In the noisy case, the standard deviations of $\filt(t)$ and $\env(t)$ can be related to the respective pure-noise reference standard deviation~(Fig.\,\ref{fig:rect-lp}d). 
This causes each curve to start with a constant regime of SNR values near 1 for smaller $\sca$, which reflects the dominance of the noise component $\noc(t)$ over the song component $\soc(t)$ in the input $\raw(t)$. For larger $\sca$, all curves transition into a regime of linearly increasing SNR on a double-logarithmic scale. For $\filt(t)$, the linear part of the curve deviates only slightly from the diagonal. For $\env(t)$, however, the transition occurs at lower $\sca$ compared to $\filt(t)$, and the linear part of the curve is shifted leftward away from the diagonal, which means that higher SNR values are achieved for the same $\sca$. This effect is more pronounced for lower $\fc$ of the lowpass filter and is presumably caused by the attenuation of high-frequency components in the signal, which are more prominent in the noise component $\noc(t)$ than in the song component $\soc(t)$. The effect also appears relatively consistent across different species, although small variations exist~(Fig.\,\ref{fig:rect-lp}e) that are presumably based on different song structures and frequency spectra. In summary, the standard deviation of $\env(t)$ has never been observed to transition into a saturation regime for larger $\sca$ but rather continues to increase proportionally to $\sca$ for all tested $\fc$, in both the noiseless and the noisy case and across different species. Consequently, the combination of rectification and lowpass filtering does not contribute to intensity invariance. However, this transformation pair does improve the SNR of $\env(t)$ relative to $\filt(t)$ and thus provides subsequent processing stages with a more robust input representation and higher input SNR. 
\begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_rect_lp.pdf} \caption{\textbf{Rectification and lowpass filtering improves SNR but does not contribute to intensity invariance.} Input $\raw(t)$ consists of song component $\soc(t)$ scaled by $\sca$ with optional noise component $\noc(t)$ and is successively transformed into tympanal signal $\filt(t)$ and envelope $\env(t)$. Different line styles indicate different cutoff frequencies $\fc$ of the lowpass filter extracting $\env(t)$. \textbf{Top}:~Example representations of $\filt(t)$ and $\env(t)$ for different $\sca$. \textbf{a}:~Noiseless case. \textbf{b}:~Noisy case. \textbf{Bottom}:~Intensity metrics over a range of $\sca$. \textbf{c}:~Noiseless case: Standard deviations $\sigma_x$ of $\filt(t)$ and $\env(t)$. \textbf{d}:~Noisy case: Ratios of $\sigma_x$ of $\filt(t)$ and $\env(t)$ to the respective reference standard deviation $\sigma_{\eta}$ for input $\raw(t)=\noc(t)$. \textbf{e}:~Ratios of $\sigma_x$ to $\sigma_{\eta}$ of $\env(t)$ as in \textbf{d} for different species (averaged over songs and recordings, see appendix Fig.\,\ref{fig:app_rect-lp}). } \label{fig:rect-lp} \end{figure} \FloatBarrier \subsection{Logarithmic compression \& spike-frequency adaptation} The second nonlinear transformation along the model pathway is the logarithmic compression of the signal envelope $\env(t)$ into $\db(t)$, Eq.\,\ref{eq:log}, which is then followed by the highpass filtering of $\db(t)$, Eq.\,\ref{eq:highpass}, to obtain the intensity-adapted envelope $\adapt(t)$. The interplay of this transformation pair was analyzed by rescaling and processing the input signal $\filt(t)$ and comparing standard deviations between the resulting $\env(t)$, $\db(t)$, and $\adapt(t)$. 
It is necessary to use $\filt(t)$ as input for this analysis instead of $\env(t)$, because $\env(t)$ results from a nonlinear transformation and hence cannot be synthesized as an additive mixture of song component $\soc(t)$ and noise component $\noc(t)$. % <-- Sentence may be methods section material. However, it is much easier to conceive a mathematical description of the effects of logarithmic compression and adaptation if $\env(t)$ itself is assumed to be composed of $\soc(t)$ and $\noc(t)$. In the noiseless case~(Fig.\,\ref{fig:log-hp}a), $\env(t)$ takes the form of \begin{equation} \env(t)\,=\,\sca\,\cdot\,\soc(t), \qquad \env(t)\,>\,0\enspace\forall\enspace t\,\in\,\mathbb{R} \label{eq:toy_env_pure} \end{equation} The standard deviation of $\env(t)$ increases linearly with $\sca$ on a double-logarithmic scale and is slightly reduced~(Fig.\,\ref{fig:log-hp}c) compared to the input $\filt(t)$, which is consistent with the results of the previous analysis~(Fig.\,\ref{fig:rect-lp}c). By conversion of $\env(t)$ to decibel scale, $\sca$ turns from a multiplicative scale in linear space into an additive term, or offset, in logarithmic space: \begin{equation} \db(t)\,=\,20\,\cdot\,\dec \left[\,\sca\,\cdot\,s(t)\,\right]\,=\,20\,\cdot\,\left[\dec \sca\,+\,\dec s(t)\right], \qquad \sca\,>\,0 \label{eq:toy_log_pure} \end{equation} The highpass filtering of $\db(t)$ can be approximated as a subtraction of the local signal offset within a suitable time interval $0 \ll \thp < \frac{1}{\fc}$: \begin{equation} \begin{split} \adapt(t)\,\approx\,\db(t)\,-\,20\,\cdot\,\dec \sca\,=\,20\,\cdot\,\dec s(t) \end{split} \label{eq:toy_highpass_pure} \end{equation} This eliminates $\sca$ from $\adapt(t)$ and thus renders it perfectly intensity-invariant, with a constant standard deviation of around 10\,dB across all $\sca>0$~(Fig.\,\ref{fig:log-hp}c). 
In contrast, in the noisy case~(Fig.\,\ref{fig:log-hp}b), $\env(t)$ takes the form of \begin{equation} \env(t)\,=\,\sca\,\cdot\,\soc(t)\,+\,\noc(t), \qquad \env(t)\,>\,0\enspace\forall\enspace t\,\in\,\mathbb{R} \label{eq:toy_env_noise} \end{equation} Similar to the previous analysis~(Fig.\,\ref{fig:rect-lp}d), the ratio of the standard deviation of $\env(t)$ to its pure-noise reference standard deviation on a double-logarithmic scale follows a constant regime for small $\sca$ and a linearly increasing regime for larger $\sca$~(Fig.\,\ref{fig:log-hp}d). Decibel conversion of $\env(t)$ % \begin{equation} % \begin{split} % \db(t)\,&=\,20\,\cdot\,\dec \left[\,\sca\,\cdot\,s(t)\,+\,\eta(t)\,\right]\\ % &=\,20\,\cdot\,\left(\dec \sca\,+\,\dec \left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]\right), \qquad \sca\,>\,0 % \end{split} % \label{eq:toy_log_noise} % \end{equation} \begin{equation} \db(t)\,=\,20\,\cdot\,\left(\dec \sca\,+\,\dec \left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]\right), \qquad \sca\,>\,0 \label{eq:toy_log_noise} \end{equation} allows for the separation of $\sca$ from $\soc(t)$ but introduces a scaling of $\noc(t)$ by the inverse of $\sca$, which remains present even after the offset subtraction: \begin{equation} \begin{split} \adapt(t)\,\approx\,20\,\cdot\,\dec\left[s(t)\,+\,\frac{\eta(t)}{\sca}\right] \end{split} \label{eq:toy_highpass_noise} \end{equation} % \begin{equation} % \begin{split} % \adapt(t)\,\approx\,\db(t)\,-\,20\,\cdot\,\dec \sca\,=\,20\,\cdot\,\dec\left[s(t)\,+\,\frac{\eta(t)}{\sca}\right] % \end{split} % \label{eq:toy_highpass_noise} % \end{equation} This means that, in the noisy case, $\sca$ cannot be entirely eliminated from $\adapt(t)$, only redistributed between $\soc(t)$ and $\noc(t)$. If $\sca$ is sufficiently large ($\sca\gg1$, saturation regime), $\noc(t)$ is attenuated to the point of being negligible, so that $\adapt(t)$ is a scale-free representation of $\soc(t)$. 
If $\sca$ and $\noc(t)$ are at similar scales ($\sca\approx1$, transient regime), $\adapt(t)$ largely resembles $\db(t)$. Finally, if $\sca$ is sufficiently small ($0<\sca\ll1$, noise regime), $\noc(t)$ masks $\soc(t)$ even after the intensity adaptation. Accordingly, the effective intensity invariance of $\adapt(t)$ through logarithmic compression and adaptation is limited by the SNR of $\env(t)$: Songs that have already sunk into the noise floor at the level of $\env(t)$ cannot be recovered by subsequent processing steps, which emphasizes the importance of the SNR improvement by rectification and lowpass filtering during the previous processing step~(Fig.\,\ref{fig:rect-lp}d). The general pattern of noise regime, transient regime, and saturation regime remains consistent across different species~(Fig.\,\ref{fig:log-hp}e). However, the specific value of $\sca$ at which the saturation regime is reached (see appendix Fig.\,\ref{fig:app_log-hp_saturation}) and the maximum SNR value of $\adapt(t)$ within the saturation regime vary considerably between and within species. For example, \textit{C. biguttulus} and \textit{C. mollis} display a noticeably lower maximum SNR of $\adapt(t)$ compared to other species. These differences are not to be underestimated, since the SNR of $\adapt(t)$ within the saturation regime determines the maximum input SNR for subsequent processing steps. In other words, the fact that $\adapt(t)$ eventually reaches a saturation regime is, of course, desirable in the context of intensity invariance, but it also means passing up on the higher SNR values that are achieved by $\env(t)$ for the same $\sca$ (up to several orders of magnitude, Fig.\,\ref{fig:log-hp}d). This trade-off between intensity invariance and SNR is a recurring phenomenon that is further addressed in the following sections.
\begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_log_hp.pdf} \caption{\textbf{Intensity invariance through logarithmic compression and adaptation is restricted by the noise floor and decreases SNR.} Input $\filt(t)$ consists of song component $\soc(t)$ scaled by $\sca$ with optional noise component $\noc(t)$ and is successively transformed into envelope $\env(t)$, logarithmically compressed envelope $\db(t)$, and intensity-adapted envelope $\adapt(t)$. \textbf{Top}:~Example representations of $\env(t)$, $\db(t)$, and $\adapt(t)$ for different $\sca$. \textbf{a}:~Noiseless case. \textbf{b}:~Noisy case. \textbf{Bottom}:~Intensity metrics over a range of $\sca$. \textbf{c}:~Noiseless case: Standard deviations $\sigma_x$ of $\env(t)$, $\db(t)$, and $\adapt(t)$. \textbf{d}:~Noisy case: Ratios of $\sigma_x$ of $\env(t)$, $\db(t)$, and $\adapt(t)$ to the respective reference standard deviation $\sigma_{\eta}$ for input $\filt(t)=\noc(t)$. Shaded areas indicate $5\,\%$ (dark grey) and $95\,\%$ (light grey) curve span for $\adapt(t)$. \textbf{e}:~Ratios of $\sigma_x$ to $\sigma_{\eta}$ of $\adapt(t)$ as in \textbf{d} for different species (averaged over songs and recordings, see appendix Fig.\,\ref{fig:app_log-hp_curves}). Dots indicate $95\,\%$ curve span per species. } \label{fig:log-hp} \end{figure} \FloatBarrier \subsection{Thresholding nonlinearity \& temporal averaging} The third nonlinear transformation along the model pathway is the thresholding nonlinearity $\nl$ that transforms each kernel response $c_i(t)$ into a binary response $b_i(t)$, Eq.\,\ref{eq:binary}. This transformation takes place after the convolutional filtering of $\adapt(t)$ with kernel $k_i(t)$, Eq.\,\ref{eq:conv}, and is followed by the temporal averaging of $b_i(t)$ into the feature set $f_i(t)$ by a lowpass filter, Eq.\,\ref{eq:lowpass}.
The effects of thresholding and temporal averaging are best illustrated based on a single kernel~(Fig.\,\ref{fig:thresh-lp_single}) instead of the full set. For this analysis, input $\adapt(t)$ was rescaled~(Fig.\,\ref{fig:thresh-lp_single}a) and convolved with kernel $k(t)$. The resulting kernel response $c(t)$ was passed through $H(c\,-\,\Theta)$ with three different threshold values $\Theta$~(Fig.\,\ref{fig:thresh-lp_single}b-d). Each resulting binary response $b(t)$ was transformed into $f(t)$, whose average feature value serves as a measure of intensity~(Fig.\,\ref{fig:thresh-lp_single}ef). The thresholding nonlinearity $H(c\,-\,\Theta)$ categorizes the values of $c(t)$ into ``relevant'' ($c(t)>\Theta$, $b(t)=1$) and ``irrelevant'' ($c(t)\leq\Theta$, $b(t)=0$) response values. It thereby splits the probability density $\pc$ of $c(t)$ within some observed time interval $T$ into two complementary parts around $\Theta$: \begin{equation} \int_{\Theta}^{+\infty} \pc\,dc\,=\,1\,-\,\int_{-\infty}^{\Theta} \pc\,dc\,=\,\frac{T_1}{T}, \qquad \infint \pc\,dc\,=\,1 \label{eq:pdf_split} \end{equation} The right-sided part of the split $\pc$ corresponds to time $T_1$ where $c(t)>\Theta$, while the left-sided part corresponds to time $T_0=T-T_1$ where $c(t)\leq\Theta$. The improper integral over the right-sided part of $\pc$ represents the ratio of time $T_1$ to total time $T$ because the integral of a probability density over the entire real line is normalized to 1. The lowpass filtering of $b(t)$ can be approximated as temporal averaging over a suitable time interval $\tlp>\frac{1}{\fc}$ in order to express $f(t)$ as a similar temporal ratio \begin{equation} f(t)\,\approx\,\frac{1}{\tlp} \int_{t}^{t\,+\,\tlp} b(\tau)\,d\tau\,=\,\frac{T_1}{\tlp}, \qquad b(t)\,\in\,\{0,\,1\} \label{eq:feat_avg} \end{equation} of time $T_1$ during which $b(t)$ is 1 within the averaging interval $\tlp$.
Therefore, the value of $f(t)$ at every time point $t$ approximately signifies the cumulative probability that $c(t)$ exceeds $\Theta$ during the corresponding averaging interval $\tlp$: \begin{equation} f(t)\,\approx\,\int_{\Theta}^{+\infty} \pclp\,dc\,=\,P(c\,>\,\Theta,\,\tlp) \label{eq:feat_prop} \end{equation} In a sense, $f(t)$ can be interpreted as some sort of duty cycle with respect to $\Theta$. For example, a feature value of $f(t)=0.4$ means that $c(t)$ exceeds $\Theta$ for approximately 40\,\% of the time within $\tlp$ around $t$. In the most extreme cases, $\Theta$ lies either above the maximum of $c(t)$ or below the minimum of $c(t)$, which results in a minimum or maximum possible feature value of $f(t)=0$~(Fig.\,\ref{fig:thresh-lp_single}d, left column) or $f(t)=1$, respectively. Importantly, $f(t)$ retains information about neither the timing of individual threshold crossings nor the precise values of $c(t)$ apart from their relation to $\Theta$. Accordingly, for a given $\Theta$, different $\sca$ can still result in similar $T_1$ segments (and hence similar feature values) depending on the magnitude of the derivative of $c(t)$ in temporal proximity to time points at which $c(t)$ crosses $\Theta$: The steeper the slope of $c(t)$, the less $T_1$ changes with variations in $\sca$. The most reliable way of exploiting this invariant property of $f(t)$ is to set $\Theta$ to a value near 0, because these values are least affected by different scales of $c(t)$. For sufficiently large $\sca$, $f(t)$ then approaches the same constant value in both the noiseless and the noisy case~(Fig.\,\ref{fig:thresh-lp_single}e, saturation regime). The value of $f(t)$ in the saturation regime is independent of the precise value of $\Theta$, but the value of $\sca$ at which the saturation regime is reached decreases with $\Theta$~(Fig.\,\ref{fig:thresh-lp_single}e).
Therefore, a threshold value of $\Theta=0$ would be the optimal choice for achieving intensity invariance at the lowest possible $\sca$. In stark contrast, the closer $\Theta$ is to 0, the higher the pure-noise response of $f(t)$ and the lower the resulting SNR of $f(t)$ between noise regime and saturation regime~(Fig.\,\ref{fig:thresh-lp_single}b-d, left column, and Fig.\,\ref{fig:thresh-lp_single}e). It is even possible to achieve an ``unlimited'' SNR of $f(t)$ by setting $\Theta$ above the maximum of the pure-noise $c(t)$, so that any value of $f(t)$ greater than 0 indicates the presence of the song component $\soc(t)$ in input $\adapt(t)$, at the cost of requiring a higher $\sca$ to reach the saturation regime. This trade-off between intensity invariance and SNR has already been observed during the previous analysis on logarithmic compression and adaptation~(Fig.\,\ref{fig:log-hp}d). However, the parameters that determine the SNR of $\adapt(t)$ are much less understood and likely relate to properties of the signal, whereas the SNR of $f(t)$ depends on the choice of $\Theta$ and can be more directly manipulated by the system. Finally, the effects of thresholding and temporal averaging must be seen in the context of the previous transformation pair of logarithmic compression and adaptation. Finally, the question remains whether the intensity-invariant output $\adapt(t)$ of the previous transformation pair allows feature Finally, the output $\adapt(t)$ of the previous transformation pair~(Fig.\,\ref{fig:log-hp}cd) can be related to the input $\adapt(t)$ of the current transformation pair by plotting the values of $f(t)$ over the standard deviation of input $\adapt(t)$ instead of $\sca$~(Fig.\,\ref{fig:thresh-lp_single}f).
This is relevant because, unlike $\sca$, the standard deviation of $\adapt(t)$ is capped to a maximum value of around 10\,dB by the previous transformation pair~(Fig.\,\ref{fig:log-hp}cd). \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_thresh_lp_single.pdf} \caption{\textbf{Intensity invariance through thresholding and temporal averaging is mediated by the interaction of threshold value and noise floor.} Input $\adapt(t)$ consists of song component $\soc(t)$ scaled by $\sca$ with optional noise component $\noc(t)$ and is transformed into single kernel response $c(t)$, binary response $b(t)$, and feature $f(t)$. Different color shades indicate different threshold values $\Theta$ (multiples of reference standard deviation $\sigma_{\eta}$ of $c(t)$ for input $\adapt(t)=\noc(t)$, with darker colors for higher $\Theta$). \textbf{Left}:~Noisy case: Example representations of $\adapt(t)$ as well as $c(t)$, $b(t)$, and $f(t)$ for different $\sca$. \textbf{a}:~$\adapt(t)$ with kernel $k(t)$ in black. \textbf{b\,-\,d}: $c(t)$, $b(t)$, and $f(t)$ based on the same $\adapt(t)$ from \textbf{a} but with different $\Theta$. \textbf{Right}:~Average value $\mu_f$ of $f(t)$ for each $\Theta$ from \textbf{b\,-\,d}. Dots indicate $95\,\%$ curve span (noisy case). \textbf{e}:~$\mu_f$ over a range of $\sca$, once for the noisy case (solid lines) and once for the noiseless case (dotted lines). \textbf{f}:~Noisy case: $\mu_f$ over the standard deviation of input $\adapt(t)$ corresponding to the values of $\sca$ shown in \textbf{e}. Shaded area indicates standard deviations that would be capped in the output $\adapt(t)$ of the previous transformation pair (see Fig.\,\ref{fig:log-hp}cd).
} \label{fig:thresh-lp_single} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_thresh_lp_species.pdf} \caption{\textbf{Feature representation of different species-specific songs saturates at different points in feature space.} Same input and processing as in Fig.\,\ref{fig:thresh-lp_single} but with three different kernels $k_i$, each with a single kernel-specific threshold value $\thr=0.5\cdot\sigma_{\eta_i}$. \textbf{a}:~Examples of species-specific grasshopper songs. \textbf{Middle}:~Average value $\mu_{f_i}$ of each feature $f_i(t)$ over $\sca$ per species (averaged over songs and recordings, see appendix Figs.\,\ref{fig:app_thresh-lp_pure} and \ref{fig:app_thresh-lp_noise}). Different color shades indicate different kernels $k_i$. Dots indicate $95\,\%$ curve span per $k_i$. \textbf{b}:~Noiseless case. \textbf{c}:~Noisy case. \textbf{Bottom}:~2D feature spaces spanned by each pair of $f_i(t)$. Each trajectory corresponds to a species-specific combination of $\mu_{f_i}$ that develops with $\sca$ (colorbars). Horizontal dashes in the colorbar indicate $5\,\%$ (dark grey) and $95\,\%$ (light grey) curve span of the norm across all three $\mu_{f_i}$ per species. \textbf{d}:~Noiseless case. \textbf{e}:~Noisy case. Shaded areas indicate the average minimum $\mu_{f_i}$ across all species-specific trajectories. } \label{fig:thresh-lp_species} \end{figure} \FloatBarrier % \caption{\textbf{Rectification and lowpass filtering improves SNR % but does not contribute to intensity invariance.} % Input $\raw(t)$ consists of song component $\soc(t)$ scaled by % $\sca$ with optional noise component $\noc(t)$ and is % successively transformed into tympanal signal $\filt(t)$ and % envelope $\env(t)$. Different line styles indicate different % cutoff frequencies $\fc$ of the lowpass filter extracting % $\env(t)$. % \textbf{Top}:~Example representations of $\filt(t)$ and % $\env(t)$ for different $\sca$. 
% \textbf{a}:~Noiseless case. % \textbf{b}:~Noisy case. % \textbf{Bottom}:~Intensity metrics over a range of $\sca$. % \textbf{c}:~Noiseless case: Standard deviations of $\filt(t)$ % and $\env(t)$. % \textbf{d}:~Noisy case: Ratios of standard deviations of % $\filt(t)$ and $\env(t)$ to the respective reference standard % deviation for input $\raw(t)=\noc(t)$. % \textbf{e}:~Ratios of standard deviations of $\env(t)$ as in % \textbf{b} for different species (averaged over songs and % recordings, see appendix Fig.\,\ref{fig:app_rect-lp}). % } \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_full_Omocestus_rufipes.pdf} \caption{\textbf{Step-wise emergence of intensity-invariant song representation along the full model pathway.} Input $\raw(t)$ consists of song component $\soc(t)$ scaled by $\sca$ with added noise component $\noc(t)$ and is processed up to the feature set $f_i(t)$. Different color shades indicate different types of Gabor kernels with specific lobe number $\kn$ and either $+$ or $-$ sign, sorted (dark to light) first by increasing $\kn$ and then by sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and $16\,$ms per type; 8 types, 40 kernels in total). \textbf{a}:~Example representations of $\filt(t)$, $\env(t)$, $\db(t)$, $\adapt(t)$, $c_i(t)$, and $f_i(t)$ for different $\sca$. \textbf{b}:~Intensity metrics over $\sca$. For $c_i(t)$ and $f_i(t)$, the median over kernels is shown. Dots indicate $95\,\%$ curve span for $\db(t)$, $\adapt(t)$, $c_i(t)$, and $f_i(t)$. \textbf{c}:~Average value $\mu_{f_i}$ of each feature $f_i(t)$ over $\sca$. \textbf{d}:~Ratios of intensity metrics to the respective reference value for input $\raw(t)=\noc(t)$. For $c_i(t)$ and $f_i(t)$, the median over kernel-specific ratios is shown. \textbf{e}:~Ratios of standard deviation $\sigma_{c_i}$ of each $c_i(t)$. \textbf{f}:~Ratios of $\mu_{f_i}$. 
\textbf{g}:~Distributions of kernel-specific $\sca$ that correspond to $95\,\%$ curve span for $c_i(t)$ and $f_i(t)$. Dots indicate the values from \textbf{b}. } \label{fig:pipeline_full} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_short_Omocestus_rufipes.pdf} \caption{\textbf{Step-wise emergence of intensity-invariant song representation along the model pathway without logarithmic compression.} Input $\raw(t)$ consists of song component $\soc(t)$ scaled by $\sca$ with added noise component $\noc(t)$ and is processed up to the feature set $f_i(t)$, skipping $\db(t)$. Different color shades indicate different types of Gabor kernels with specific lobe number $\kn$ and either $+$ or $-$ sign, sorted (dark to light) first by increasing $\kn$ and then by sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and $16\,$ms per type; 8 types, 40 kernels in total). \textbf{a}:~Example representations of $\filt(t)$, $\env(t)$, $\adapt(t)$, $c_i(t)$, and $f_i(t)$ for different $\sca$. \textbf{b}:~Intensity metrics over $\sca$. For $c_i(t)$ and $f_i(t)$, the median over kernels is shown. Dots indicate $95\,\%$ curve span for $f_i(t)$. \textbf{c}:~Average value $\mu_{f_i}$ of each feature $f_i(t)$ over $\sca$. \textbf{d}:~Ratios of intensity metrics to the respective reference value for input $\raw(t)=\noc(t)$. For $c_i(t)$ and $f_i(t)$, the median over kernel-specific ratios is shown. \textbf{e}:~Ratios of $\mu_{f_i}$. \textbf{f}:~Distribution of kernel-specific $\sca$ that correspond to $95\,\%$ curve span for $f_i(t)$. Dots indicate the value from \textbf{b}.
} \label{fig:pipeline_short} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_features_cross_species.pdf} \caption{\textbf{Interspecific and intraspecific feature variability.} Average value $\mu_{f_i}$ of each feature $f_i(t)$ against its counterpart from a second feature set based on a different input $\raw(t)$. Each dot within a subplot represents a single feature $f_i(t)$. Different color shades indicate different types of Gabor kernels with specific lobe number $\kn$ and either $+$ or $-$ sign, sorted (dark to light) first by increasing $\kn$ and then by sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and $16\,$ms per type; 8 types, 40 kernels in total). Data is based on the analysis underlying Fig.\,\ref{fig:pipeline_full}. \textbf{Lower triangular}:~Interspecific comparisons between single songs of different species. \textbf{Upper triangular}:~Intraspecific comparisons between different songs of a single species (\textit{O. rufipes}). \textbf{Lower left}:~Distribution of correlation coefficients $\rho$ for each interspecific and intraspecific comparison. Dots indicate single $\rho$ values. } \label{fig:feat_cross_species} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_field.pdf} \caption{\textbf{Step-wise emergence of intensity-invariant song representation along the model pathway.} } \label{fig:pipeline_field} \end{figure} \FloatBarrier \section{Conclusions \& outlook} \textbf{Song recognition pathway: Grasshopper vs. model:}\\ The model pathway includes a rather large number of Gabor kernels compared to the 15 to 20 ascending neurons in the grasshopper auditory system~(\bcite{stumpner1991auditory}).
\textbf{Definition of invariance (general, systemic):}\\ Invariance = Property of a system to maintain a stable output with respect to a set of relevant input parameters (variation to be represented) but irrespective of one or more other parameters (variation to be discarded) $\rightarrow$ Selective input-output decorrelation \textbf{Definition of intensity invariance (context of neurons and songs):}\\ Intensity invariance = Time scale-selective sensitivity to certain faster amplitude dynamics (song waveform, small-scale AM) and simultaneous insensitivity to slower, more sustained amplitude dynamics (transient baseline, large-scale AM, current overall intensity level)\\ $\rightarrow$ Without time scale selectivity, any fully intensity-invariant output will be a flat line \textbf{Log-HP: Implication for intensity invariance:}\\ - Logarithmic scaling is essential for equalizing different song intensities\\ $\rightarrow$ Intensity information can be manipulated more easily when in form of a signal offset in log-space than a multiplicative scale in linear space - Capability to compensate for intensity variations, i.e. selective amplification of output $\adapt(t)$ relative to input $\env(t)$, is limited by input SNR (Eq.\,\ref{eq:toy_snr}):\\ $\rightarrow$ Ability to equalize between different sufficiently large scales of $s(t)$\\ $\rightarrow$ Inability to recover $s(t)$ when initially masked by noise floor $\eta(t)$ - Logarithmic scaling emphasizes small amplitudes (song onsets, noise floor) \\ $\rightarrow$ Recurring trade-off: Equalizing signal intensity vs preserving initial SNR \textbf{Thresh-LP: Implication for intensity invariance:}\\ - Role of song periodicity for feature representation! 
- Suggests a relatively simple rule for optimal choice of threshold value $\thr$:\\ $\rightarrow$ Find amplitude $c_i$ that maximizes absolute derivative of $c_i(t)$ over time\\ $\rightarrow$ Optimal with respect to intensity invariance of $f_i(t)$, not necessarily for other criteria such as song-noise separation or diversity between features - Nonlinear operations can be used to detach representations from graded physical stimulus (to facilitate categorical behavioral decision-making?):\\ 1) Capture sufficiently precise amplitude information: $\env(t)$, $\adapt(t)$\\ $\rightarrow$ Closely following the AM of the acoustic stimulus\\ 2) Quantify relevant stimulus properties on a graded scale: $c_i(t)$\\ $\rightarrow$ More decorrelated representation, compared to prior stages\\ 3) Nonlinearity: Distinguish between ``relevant vs irrelevant'' values: $b_i(t)$\\ $\rightarrow$ Trading a graded scale for two or more categorical states\\ 4) Represent stimulus properties under relevance constraint: $f_i(t)$\\ $\rightarrow$ Graded again but highly decorrelated from the acoustic stimulus\\ 5) Categorical behavioral decision-making requires further nonlinearities\\ $\rightarrow$ Parameters of a behavioral response may be graded (e.g. approach speed), initiation of one behavior over another is categorical (e.g.
approach/stay) \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_noise_env_sd_conversion_appendix.pdf} \caption{\textbf{} } \label{fig:app_env-sd} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_rect-lp_appendix.pdf} \caption{\textbf{} } \label{fig:app_rect-lp} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_log-hp_appendix.pdf} \caption{\textbf{} } \label{fig:app_log-hp_curves} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_saturation_log-hp_appendix.pdf} \caption{\textbf{} } \label{fig:app_log-hp_saturation} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_thresh-lp_pure_appendix.pdf} \caption{\textbf{} } \label{fig:app_thresh-lp_pure} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_thresh-lp_noise_appendix.pdf} \caption{\textbf{} } \label{fig:app_thresh-lp_noise} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_thresh_lp_appendix.pdf} \caption{\textbf{} } \label{fig:app_thresh-lp_kern-sd} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_full_appendix.pdf} \caption{\textbf{} } \label{fig:app_full_kern-sd} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_short_appendix.pdf} \caption{\textbf{} } \label{fig:app_short_kern-sd} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_field_appendix.pdf} \caption{\textbf{} } \label{fig:app_field_kern-sd} \end{figure} \FloatBarrier \begin{figure}[!ht] \centering 
\includegraphics[width=\textwidth]{figures/fig_invariance_cross_species_thresh_appendix.pdf} \caption{\textbf{} } \label{fig:app_cross_species_thresh} \end{figure} \FloatBarrier \end{document}