\documentclass[a4paper, 12pt]{article} \usepackage[left=2cm,right=2cm,top=2cm,bottom=2cm,includeheadfoot]{geometry} % \usepackage[onehalfspacing]{setspace} \usepackage{graphicx} \usepackage{svg} \usepackage{import} \usepackage{float} \usepackage{placeins} \usepackage{parskip} \usepackage{amsmath} \usepackage{amssymb} \usepackage{subcaption} \usepackage[labelfont=bf, textfont=small]{caption} \usepackage[german,english]{babel} \addto\captionsenglish{\renewcommand{\figurename}{Fig.}} \addto\captionsenglish{\renewcommand{\tablename}{Tab.}} \usepackage[separate-uncertainty=true, locale=DE]{siunitx} \sisetup{output-exponent-marker=\ensuremath{\mathrm{e}}} % \usepackage[capitalize]{cleveref} % \crefname{figure}{Fig.}{Figs.} % \crefname{equation}{Eq.}{Eqs.} % \creflabelformat{equation}{#2#1#3} \usepackage[ backend=biber, style=authoryear, pluralothers=true, maxcitenames=1, mincitenames=1 ]{biblatex} \addbibresource{cite.bib} %\bibdata %\bibstyle %\citation \title{Emergent intensity invariance vs. 
signal-to-noise ratio at three consecutive processing stages along the grasshopper song recognition pathway} \author{Jona Hartling, Jan Benda} \date{} \begin{document} \maketitle{} % Text references and citations: \newcommand{\bcite}[1]{\mbox{\cite{#1}}} % \newcommand{\fref}[1]{\mbox{\cref{#1}}} % \newcommand{\fref}[1]{\mbox{Fig.\,\ref{#1}}} % \newcommand{\eref}[1]{\mbox{\cref{#1}}} % \newcommand{\eref}[1]{\mbox{Eq.\,\ref{#1}}} % Subplot lettering: \newcommand{\figa}{\textbf{a}} \newcommand{\figb}{\textbf{b}} \newcommand{\figc}{\textbf{c}} \newcommand{\figd}{\textbf{d}} \newcommand{\fige}{\textbf{e}} % Math shorthands - Standard symbols: \newcommand{\dec}{\log_{10}} % Logarithm base 10 \newcommand{\infint}{\int_{-\infty}^{+\infty}} % Indefinite integral % Math shorthands - Spectral filtering: \newcommand{\bp}{h_{\text{BP}}(t)} % Bandpass filter function \newcommand{\lp}{h_{\text{LP}}(t)} % Lowpass filter function \newcommand{\hp}{h_{\text{HP}}(t)} % Highpass filter function \newcommand{\fc}{f_{\text{cut}}} % Filter cutoff frequency \newcommand{\tlp}{T_{\text{LP}}} % Lowpass filter averaging interval \newcommand{\thp}{T_{\text{HP}}} % Highpass filter adaptation interval % Math shorthands - Early representations: \newcommand{\raw}{x_{\text{raw}}} % Placeholder input signal \newcommand{\filt}{x_{\text{filt}}} % Bandpass filtered signal \newcommand{\env}{x_{\text{env}}} % Signal envelope \newcommand{\db}{x_{\text{log}}} % Logarithmically scaled signal \newcommand{\dbref}{x_{\text{ref}}} % Decibel reference intensity \newcommand{\adapt}{x_{\text{adapt}}} % Adapted signal % Math shorthands - Kernel parameters: \newcommand{\kw}{\sigma} % Unspecific Gabor kernel width \newcommand{\kf}{\omega} % Unspecific Gabor kernel frequency \newcommand{\kp}{\phi} % Unspecific Gabor kernel phase \newcommand{\kn}{n} % Unspecific Gabor kernel lobe number \newcommand{\kwi}{\kw_i} % Specific Gabor kernel width \newcommand{\kfi}{\kf_i} % Specific Gabor kernel frequency 
\newcommand{\kpi}{\kp_i} % Specific Gabor kernel phase \newcommand{\kni}{\kn_i} % Specific Gabor kernel lobe number % Math shorthands - Auxiliary kernel parameters: \newcommand{\fdrm}{\text{FDRM}} % Gaussian full duration relative to maximum \newcommand{\rh}{h_{\text{rel}}} % Relative Gaussian height for FDRM calculation % Math shorthands - Thresholding nonlinearity: \newcommand{\thr}{\Theta_i} % Step function threshold value \newcommand{\nl}{H(c_i\,-\,\thr)} % Shifted Heaviside step function % Math shorthands - Intensity invariance analysis: \newcommand{\soc}{s} % Song component of synthetic mixture \newcommand{\noc}{\eta} % Noise component of synthetic mixture \newcommand{\sca}{\alpha} % Multiplicative scale of song component \newcommand{\xvar}{\sigma_{x}^{2}} % Variance of synthetic mixture \newcommand{\svar}{\sigma_{\text{s}}^{2}} % Song component variance \newcommand{\nvar}{\sigma_{\eta}^{2}} % Noise component variance \newcommand{\xsig}{\sigma_x} % Standard deviation of synthetic mixture \newcommand{\ssig}{\sigma_{\text{s}}} % Song component standard deviation \newcommand{\nsig}{\sigma_{\eta}} % Noise component standard deviation \newcommand{\pc}{p(c,\,T)} % Probability density (general interval) \newcommand{\pclp}{p(c,\,\tlp)} % Probability density (lowpass interval) \newcommand{\pci}{p(c_i,\,\tlp)} % Kernel-specific probability density (lowpass interval) \newcommand{\muf}{\mu_{f_i}} % Average feature value \section{Introduction} % % Drosophila/visual/article: % \bcite{ketkar2023multifaceted} % % Drosophila/auditory/article: % \bcite{ozeri2018fast} % % Primate/auditory/review: % \bcite{barbour2011intensity} % % Cricket/auditory/article: % \bcite{benda2008spike} % % Locust/auditory/article: % \bcite{clemens2010intensity} % % Rodent/olfactory/article: % \bcite{bolding2018recurrent} % Introduction to intensity invariance: Intensity invariance is a fundamental property of sensory systems across modalities and species, from fruit flies~(\bcite{ozeri2018fast}; 
\bcite{ketkar2023multifaceted}) over crickets~(\bcite{benda2008spike}) and grasshoppers~(\bcite{clemens2010intensity}) to rodents~(\bcite{bolding2018recurrent}) and primates~(\bcite{barbour2011intensity}). It allows for the robust recognition of behaviorally relevant stimuli despite variations in stimulus intensity. However, the computational mechanisms underlying intensity invariance are often difficult to disentangle. Here, we use a physiologically inspired functional model of the grasshopper song recognition pathway to investigate the emergence of intensity invariance throughout the auditory processing stream. % Why the grasshopper auditory system? % Why focus on song recognition among other auditory functions? The auditory system of grasshoppers~(\textit{Acrididae}) has been studied extensively over the years. Grasshoppers rely on their sense of hearing for intraspecific communication --- including mate attraction~(\bcite{helversen1972gesang}) and evaluation~(\bcite{stange2012grasshopper}), sender localization~(\bcite{helversen1988interaural}), courtship display~(\bcite{elsner1968neuromuskularen}), and rival deterrence~(\bcite{greenfield1993acoustic}) --- and have evolved a variety of acoustic signals for different behavioral contexts~(\bcite{otte1970comparative}). The most conspicuous acoustic signals of grasshoppers are their species-specific calling songs, which broadcast the presence of the singing individual to potential mates within range. These songs are usually more characteristic of a species than morphological traits~(\bcite{tishechkin2016acoustic}; \bcite{tarasova2021eurasius}), which can vary greatly within species~(\bcite{rowell1972variable}; \bcite{kohler2017morphological}). 
The reliance on songs to mediate reproduction represents a strong evolutionary driving force that resulted in a massive species diversification~(\bcite{vedenina2011speciation}; \bcite{sevastianov2023evolution}), with over 6800 recognized species in the \textit{Acrididae} family~(\bcite{cigliano2024orthoptera}). % What are the signals that the auditory system is supposed to recognize? Grasshopper songs are amplitude-modulated broad-band acoustic signals. They consist of a series of noisy syllables and relatively quiet pauses, which form a characteristic repetitive pattern~(\bcite{helversen1977stridulatory}; \bcite{stumpner1994song}). Song recognition depends on certain structural parameters of this pattern --- such as the duration of syllables and pauses~(\bcite{helversen1972gesang}), the slope of pulse onsets~(\bcite{helversen1993absolute}), and the accentuation of syllable onsets relative to the preceding pause~(\bcite{balakrishnan2001song}; \bcite{helversen2004acoustic}) --- which are sufficiently conveyed by the amplitude modulation of the song alone~(\bcite{helversen1997recognition}). % Why is intensity invariance important for song recognition? Grasshopper songs, like all acoustic signals, are subject to sound attenuation, which depends on the distance from the sound source, the frequency content of the signal, and the vegetation of the habitat~(\bcite{michelsen1978sound}). Sound attenuation has two major consequences for song recognition. First, the amplitude dynamics of the song pattern degrade with increasing distance to the sender, which limits the effective communication range of grasshoppers to~\mbox{1--2\,m} in their typical grassland habitats~(\bcite{lang2000acoustic}). Second, the intensity of a song at the receiver's position varies with the position of the sender, which should ideally not affect song recognition. 
The auditory system thus needs to achieve a certain degree of intensity invariance --- a time scale-selective sensitivity to faster amplitude dynamics and simultaneous insensitivity to more sustained amplitude dynamics. Intensity invariance is commonly associated with neural adaptation~(\bcite{benda2008spike}; \bcite{barbour2011intensity}; \bcite{ozeri2018fast}; more generally:~\bcite{benda2021neural}). Different neuron types in the grasshopper auditory system exhibit spike-frequency adaptation in response to sustained stimulation~(\bcite{romer1976informationsverarbeitung}; \bcite{gollisch2004input}; \bcite{hildebrandt2009origin}; \bcite{clemens2010intensity}; \bcite{fisch2012channel}). Accordingly, intensity invariance is not the result of a single processing step but rather a gradual process, in which different neuronal populations contribute to varying degrees~(\bcite{clemens2010intensity}) and by different mechanisms~(\bcite{hildebrandt2009origin}). % How did we expand on the previous framework (feat. Clemens et al.)? In the current study, we leverage functional modelling to trace the emergence of intensity invariance through individual processing steps of the grasshopper song recognition pathway. The model pathway we propose here is based on a previous functional model framework for song recognition in both crickets~(\bcite{clemens2013computational}; \bcite{hennig2014time}) and grasshoppers~(\bcite{clemens2013feature}; review on both:~\bcite{ronacher2015computational}). The existing framework relies on pulse trains as input signals, which were designed to capture the essential structural properties of natural song envelopes~(\bcite{clemens2013feature}). It includes feature extraction by a bank of linear-nonlinear feature detectors, evidence accumulation by temporal averaging of each feature, and categorical decision making by a weighted linear combination of feature values. 
We adopted the general structure of the existing framework and extended it by a physiologically plausible preprocessing stage --- including spectral filtering, envelope extraction, logarithmic compression, and intensity adaptation --- which allows the model to operate on unmodified recordings of natural grasshopper songs. The resulting model pathway thus covers the entire auditory processing stream from the initial reception of airborne sound waves to the generation of a high-dimensional feature representation that allows for the categorical recognition of conspecific songs. It incorporates anatomical, physiological, and ethological evidence from several decades of research on the grasshopper auditory system. In the following, we provide a side-by-side account of the known physiological processing steps along the song recognition pathway and their functional approximations in the model pathway. We then elaborate on the computational mechanisms that contribute to the emergence of intensity-invariant song representations, the interaction between these mechanisms, the overall capacity for intensity invariance in the system, and the ethological implications of our findings. \newpage \section{Methods} % This maybe does not quite fit here, but it is the most general part of the % methods and applies throughout the whole section, so I put it here for now. All modeling, data analysis, and data visualization were performed in Python~3.12.3 except for the pathway overview~(Fig.\,\ref{fig:pathway}), which was assembled in Inkscape~1.2. The code base for the model pathway is available as the \textit{thunderhopper} package, version 1.0, on PyPI. Any audio data was inspected and edited with the help of the \textit{audian} package, version 2.4, on PyPI. 
\subsection{Functional model of the grasshopper song recognition pathway} The anatomical organisation of the grasshopper song recognition pathway can be outlined as a feed-forward network of three consecutive neuronal populations~(Fig.\,\ref{fig:pathway}a-c): Peripheral auditory receptor neurons, whose axons enter the ventral nerve cord (VNC) at the level of the metathoracic ganglion; local interneurons that remain exclusively within the thoracic region of the VNC; and ascending neurons projecting from the thoracic region towards the supraesophageal ganglion (SEG), or central brain~(\bcite{rehbein1974structure}; \bcite{rehbein1976auditory}; \bcite{eichendorf1980projections}). The input to the network originates at the tympanal membrane, which acts as acoustic receiver and is coupled to the dendritic endings of the receptor neurons~(\bcite{gray1960fine}). The outputs from the network converge in the SEG, which presumably harbors the neuronal substrate for conspecific song recognition and response initiation~(\bcite{ronacher1986routes}; \bcite{bauer1987separate}; \bcite{bhavsar2017brain}). Around 15 to 20 ascending neurons have been identified in the grasshopper auditory system~(\bcite{stumpner1991auditory}), whose functional characteristics are conserved even between species that are not closely related~(\bcite{neuhofer2008evolutionarily}). The population of ascending neurons possesses a diverse range of response properties that contrasts with the rather homogeneous responses of receptor neurons and local interneurons~(\bcite{clemens2011efficient}), which suggests a transition from a uniform population-wide processing stream into several parallel branches. 
Accordingly, the model pathway is divided into two distinct stages~(Fig.\,\ref{fig:pathway}d): The preprocessing stage incorporates the processing steps at the levels of the tympanal membrane, the receptor neurons, and the local interneurons; and operates on one-dimensional signal representations~(Fig.\,\ref{fig:stages_pre}). The feature extraction stage corresponds to the processing within the ascending neurons and further downstream towards the SEG; and operates on high-dimensional signal representations~(Fig.\,\ref{fig:stages_feat}). The details of each physiological processing step and its functional approximation are described in the following sections. 
\begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_auditory_pathway.pdf} \caption{\textbf{Schematic organisation of the grasshopper song recognition pathway and structure of the functional model pathway.} \textbf{a}:~Simplified course of the pathway in the grasshopper, from the tympanal membrane over receptor neurons, local interneurons, and ascending neurons further towards the supraesophageal ganglion. \textbf{b}:~Schematic of synaptic connections between the three neuronal populations within the metathoracic ganglion. \textbf{c}:~Network representation of neuronal connectivity. \textbf{d}:~Flow diagram of consecutive signal representations~(boxes) and transformations~(arrows) along the model pathway. All representations are time-varying. 1st half: Preprocessing stage~(one-dimensional representations). 2nd half: Feature extraction stage~(high-dimensional representations). } \label{fig:pathway} \end{figure} \subsubsection{Population-driven signal preprocessing} Grasshoppers receive airborne sound waves by a tympanal organ at each side of the body. The tympanal membrane acts as a mechanical resonance filter for sound-induced vibrations~(\bcite{windmill2008time}; \bcite{malkin2014energy}). Vibrations that fall within specific frequency bands are focused on different membrane areas, while others are attenuated. This processing step can be approximated by an initial bandpass filter~(Fig.\,\ref{fig:stages_pre}a) applied to the acoustic input signal $\raw(t)$: \begin{equation} \filt(t)\,=\,\raw(t)\,*\,\bp, \qquad \fc\,=\,5\,\text{kHz},\,30\,\text{kHz} \label{eq:bandpass} \end{equation} The receptor neurons transduce the vibrations of the tympanal membrane into sequences of action potentials. They thereby encode the amplitude modulation, or envelope, of the signal~(\bcite{machens2001discrimination}), which likely involves a rectifying nonlinearity~(\bcite{machens2001representation}). 
The extraction of the signal envelope~(Fig.\,\ref{fig:stages_pre}b) can be modelled as full-wave rectification followed by lowpass filtering of the tympanal signal $\filt(t)$: \begin{equation} \env(t)\,=\,|\filt(t)|\,*\,\lp, \qquad \fc\,=\,250\,\text{Hz} \label{eq:env} \end{equation} Furthermore, the receptors exhibit a sigmoidal response curve over logarithmically compressed stimulus intensities~(\bcite{suga1960peripheral}; \bcite{gollisch2002energy}). In the model pathway, logarithmic compression~(Fig.\,\ref{fig:stages_pre}c) is achieved by conversion to decibel scale \begin{equation} \db(t)\,=\,20\,\cdot\,\dec \frac{\env(t)}{\dbref}, \qquad \dbref\,=\,1 \label{eq:log} \end{equation} relative to the common reference intensity $\dbref$. Both the receptor neurons~(\bcite{romer1976informationsverarbeitung}; \bcite{gollisch2004input}; \bcite{fisch2012channel}) and, on a larger scale, the subsequent local interneurons~(\bcite{hildebrandt2009origin}; \bcite{clemens2010intensity}) adapt their firing rates in response to sustained stimulus intensities, which allows for the robust encoding of faster amplitude modulations against a slowly changing overall baseline intensity. Functionally, the adaptation mechanism resembles a highpass filter~(Fig.\,\ref{fig:stages_pre}d) over the logarithmically compressed envelope $\db(t)$: \begin{equation} \adapt(t)\,=\,\db(t)\,*\,\hp, \qquad \fc\,=\,10\,\text{Hz} \label{eq:highpass} \end{equation} This processing step concludes the preprocessing stage of the model pathway. The resulting intensity-adapted envelope $\adapt(t)$ is then passed on from the local interneurons to the ascending neurons, where it serves as the basis for the following feature extraction stage. \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_pre_stages.pdf} \caption{\textbf{Song representations during the preprocessing stage.} Example song of \textit{O. rufipes}. \textbf{a}:~Bandpass filtered tympanal signal $\filt(t)$. 
\textbf{b}:~Signal envelope $\env(t)$. \textbf{c}:~Logarithmically compressed envelope $\db(t)$. \textbf{d}:~Intensity-adapted envelope $\adapt(t)$. } \label{fig:stages_pre} \end{figure} \FloatBarrier \subsubsection{Feature extraction by individual neurons} The population of ascending neurons extracts and encodes a number of different features of the preprocessed signal, and hence represents the signal in a higher-dimensional space than the preceding receptor neurons and local interneurons~(\bcite{clemens2011efficient}). Each ascending neuron is assumed to scan the signal for a specific template pattern, which can be thought of as a kernel of a particular structure and on a particular time scale. This process, known as template matching, can be modelled as a convolution of the intensity-adapted envelope $\adapt(t)$ with a kernel $k_i(t)$ specific to the $i$-th ascending neuron: \begin{equation} c_i(t)\,=\,\adapt(t)\,*\,k_i(t) = \infint \adapt(\tau)\,\cdot\,k_i(t\,-\,\tau)\,d\tau \label{eq:conv} \end{equation} We use Gabor kernels as basis functions for creating different template patterns. Gabor functions presumably capture the essential structural properties of the filter functions found in various auditory neurons~(\bcite{rokem2006spike}; \bcite{clemens2011efficient}; \bcite{clemens2012nonlinear}). An arbitrary one-dimensional, real Gabor kernel is generated by multiplication of a Gaussian envelope with standard deviation or kernel width $\kwi$ and a sinusoidal carrier with frequency $\kfi$ and phase $\kpi$: \begin{equation} k_i(t,\,\kwi,\,\kfi,\,\kpi)\,=\,e^{-\frac{t^{2}}{2{\kwi}^{2}}}\,\cdot\,\sin(\kfi\,t\,+\,\kpi), \qquad \kfi\,=\,2\pi f_{\text{sin}_i} \label{eq:gabor} \end{equation} Different combinations of $\kwi$ and $\kfi$ result in Gabor kernels with different lobe number $\kni$, which is the number of half-periods of the carrier that fit under the Gaussian envelope within reasonable limits of attenuation. 
The time window under the Gaussian envelope that contains the relevant lobes of the kernel can be defined as Gaussian full duration at height $\rh$ relative to the maximum of the Gaussian: \begin{equation} \fdrm(\kwi,\,\rh)\,=\,2\,\cdot\,\sqrt{-2\,\cdot\,\ln \rh}\cdot\,\kwi, \qquad \rh\,\in\,(0,\,1] \label{eq:fdrm} \end{equation} % Yes, FDRM is a hideous acronym. Based on the common "full width at half % maximum" (FWHM) and adjusted because "full duration at half maximum" (FDHM) % is apparently preferred in a temporal context. Alternatively, "w_\text{gauss}"? With this, an appropriate carrier frequency $\kfi$ for obtaining a Gabor kernel with width $\kwi$ and desired lobe number $\kni$ can be approximated as \begin{equation} \kfi(\kni,\,\kwi,\,\rh)\,=\,\frac{0.5\,\cdot\,\kni\,+\,\beta_0}{\fdrm(\kwi,\,\rh)}, \qquad \kni\,\geq\,2\enspace\forall\enspace \kni\,\in\,\mathbb{Z} \label{eq:gabor_freq} \end{equation} % \begin{equation} % \kfi(\kni,\,\kwi,\,\rh)\,=\,\frac{0.5\,\cdot\,\kni\,+\,\beta_0}{2\,\cdot\,\sqrt{-2\,\cdot\,\ln \rh}\cdot\kwi}, \qquad \kni\,\geq\,2\enspace\forall\enspace \kni\,\in\,\mathbb{Z} % \end{equation} The relationship between $\kfi$ and $\kni$ is approximately linear except for small $\kni$. The offset term $\beta_0\approx0.26$ was added to balance the amplitudes of the $\kni$ desired lobes of the kernel --- which should be maximized --- against the amplitudes of the next-outer lobes, which should not exceed the threshold value determined by $\rh$. Note that simple Gaussian kernels with $\kni=1$ can be obtained by setting the carrier frequency to $\kfi=0$ and are hence not covered by Eq.\,\ref{eq:gabor_freq}. Carrier phase $\kpi$ determines the position of the kernel lobes relative to the kernel center. We restrict the Gabor kernels to be either even or odd functions by setting $\kpi$ to one of only four specific phase values~(Tab.\,\ref{tab:gabor_phases}). 
Even Gabor kernels are mirror-symmetric with uneven $\kni$, whereas odd Gabor kernels are point-symmetric with even $\kni$. Both even and odd kernels can have either positive or negative sign, which refers to the sign of the kernel's central lobe (even kernels) or the left of the two central lobes (odd kernels). These four major groups of Gabor kernels allow for the extraction of different types of signal features, such as the presence of peaks (even, $+$), troughs (even, $-$), onsets (odd, $+$), and offsets (odd, $-$) at various time scales. \FloatBarrier \begin{table}[!ht] \centering \captionsetup{width=.45\textwidth} \caption{Values of phase $\kp$ that are specific for the four major groups of Gabor kernels.} \begin{tabular}{|ccc|} \hline sign & even kernels & odd kernels\\ \hline $+$ & $+\pi\,/\,2$ & $\pi$\\ $-$ & $-\pi\,/\,2$ & $0$\\ \hline \end{tabular} \label{tab:gabor_phases} \end{table} \FloatBarrier Following the convolutional template matching~(Fig.\,\ref{fig:stages_feat}a), each kernel-specific response $c_i(t)$ is passed through a shifted Heaviside step-function $\nl$ with threshold value $\thr$ to obtain a binary response~(Fig.\,\ref{fig:stages_feat}b): \begin{equation} b_i(t,\,\thr)\,=\,\begin{cases} \;1, \quad c_i(t)\,>\,\thr\\ \;0, \quad c_i(t)\,\leq\,\thr \end{cases} \label{eq:binary} \end{equation} The thresholding of $c_i(t)$ into $b_i(t)$ can be thought of as a categorization into "relevant" and "irrelevant" response values. Similar thresholding nonlinearities have been a crucial processing step in previous models that deal with the extraction of behaviorally relevant song features in insects~(\bcite{clemens2013computational}; \bcite{clemens2013feature}; \bcite{hennig2014time}; \bcite{ronacher2015computational}). % However, there is no direct physiological evidence that would allow to % determine the exact location or underlying mechanism of such a nonlinearity in % either the ascending neurons or at some point further downstream in the SEG. 
In the grasshopper, the responses of the ascending neurons are assumed to be integrated somewhere in the SEG~(\bcite{ronacher1986routes}; \bcite{bauer1987separate}; \bcite{bhavsar2017brain}). In the model pathway, temporal integration is implemented as temporal averaging of the binary responses $b_i(t)$ by a lowpass filter with extremely low cutoff frequency: \begin{equation} f_i(t)\,=\,b_i(t)\,*\,\lp, \qquad \fc\,=\,1\,\text{Hz} \label{eq:lowpass} \end{equation} This processing step results in a set of slowly changing kernel-specific features $f_i(t)$, which is the final representation along the model pathway~(Fig.\,\ref{fig:stages_feat}c). In the resulting high-dimensional feature space, different species-specific song patterns can be distinguished by their distinct combination of feature values, e.\,g.\ using Euclidean geometry or a simple linear classifier. \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_feat_stages.pdf} \caption{\textbf{Song representations during the feature extraction stage.} Example song of \textit{O. rufipes}. Different color shades indicate different types of Gabor kernels with specific lobe number $\kni$ and either $+$ or $-$ sign, sorted (dark to light) first by increasing $\kni$ and then by sign~($1\,\leq\,\kni\,\leq\,4$; first $+$, then $-$ for each $\kni$; two kernel widths $\kwi$ of $4\,$ms and $32\,$ms per type; 8 types, 16 kernels in total). \textbf{a}:~Kernel-specific filter responses $c_i(t)$. \textbf{b}:~Binary responses $b_i(t)$. \textbf{c}:~Finalized features $f_i(t)$.} \label{fig:stages_feat} \end{figure} \FloatBarrier \subsection{Simulation-based analysis of the model pathway} \subsubsection{Data sourcing} All simulations were based on a dataset that was assembled from five different sources, each of which is an established reference for the identification of European grasshopper species. 
The dataset was limited to six species from the species-rich \textit{Gomphocerinae} sub-family that are known to be common throughout Central and Southern Europe. All recordings were converted to standard~\textit{.wav}~format with a sampling rate of~44.1\,kHz and an amplitude scale in arbitrary units. Individual songs were then cut from each recording. The dataset includes a total of 31 recordings across species, which amounts to a total of 153 isolated songs. However, the number of available species-specific songs varies greatly across species, with a maximum of 48 songs for \textit{C. biguttulus} and a minimum of 6 songs for \textit{C. mollis}~(Tab.\,\ref{tab:species_list}). \begin{itemize} \item "Heuschrecken beobachten, bestimmen" by~Heiko~Bellmann\\ 1$^{\text{st}}$\,edition, 1993, Naturbuch, Augsburg \item "Gesänge der heimischen Heuschrecken. Akustisch-optische Bestimmungshilfe."\\ by~Karl-Heinz~Garberding, Deutscher Jugendbund für Naturbeobachtung\\ 1$^{\text{st}}$\,edition, 2001, DJN, Göttingen \item "Heuschrecken -- Die Stimmen von 61 heimischen Arten" by~Heiko~Bellmann\\ 1$^{\text{st}}$\,edition, 2004, AMPLE, Germering \item "Fauna d'Italia XLVIII -- Orthoptera" by~Bruno~Massa, Paolo~Fontana, Filippo~M.~Buzzetti, Roy~M.J.C.~Kleukers, Baudewijn~Odé\\ 1$^{\text{st}}$\,edition, 2012, edagricola, Milano \item "Singing Orthoptera of Slovenia" by~Stanislav~Gomboc, Blaz~Segula\\ 1$^{\text{st}}$\,edition, 2014, EGEA, Ljubljana \end{itemize} \begin{table}[!ht] \centering \captionsetup{width=.75\textwidth} \caption{Overview of the six grasshopper species from the \textit{Gomphocerinae} sub-family, the number of sources per species, the number of available recordings across sources, and the number of isolated songs across recordings.} \begin{tabular}{|lccc|} \hline \textbf{Species} & \textbf{Sources} & \textbf{Recordings} & \textbf{Songs}\\ \hline \textit{Chorthippus biguttulus} & 5 & 6 & 48\\ \textit{Chorthippus mollis} & 3 & 3 & 6\\ \textit{Chrysochraon dispar} & 
4 & 5 & 45\\ \textit{Gomphocerippus rufus} & 4 & 8 & 16\\ \textit{Omocestus rufipes} & 4 & 5 & 14\\ \textit{Pseudochorthippus parallelus} & 4 & 4 & 24\\ \hline \end{tabular} \label{tab:species_list} \end{table} \subsubsection{Generation of synthetic input signals} Different processing steps along the model pathway were tested for intensity invariance by generating synthetic input signals $x(t)$ of varying intensity, transforming them through the respective processing steps, and comparing the resulting signal representations. Inputs were generated for two distinct cases. In the idealized, noiseless case, $x(t)$ consists of a song component $\soc(t)$ with $\ssig=1$ and a multiplicative scale $\sca$: \begin{equation} x(t)\,=\,\sca\,\cdot\,\soc(t), \qquad \sca\,\geq\,0 \label{eq:noiseless} \end{equation} In the noiseless case, $x(t)$ is hence only a scaled version of $\soc(t)$ with $\xsig=\sca$. In the more realistic, noisy case, $x(t)$ consists of the same song component $\soc(t)$ scaled by $\sca$ and an additive noise component $\noc(t)$ with $\nsig=1$: \begin{equation} x(t)\,=\,\sca\,\cdot\,\soc(t)\,+\,\noc(t), \qquad \sca\,\geq\,0 \label{eq:noisy} \end{equation} Accordingly, the signal-to-noise ratio (SNR) of input $x(t)$ in the noisy case equals the squared $\sca$ value: \begin{equation} \text{SNR}_x(\sca)\,=\,\frac{(\sca\,\cdot\,\ssig)^2}{\nsig^2}\,=\,\sca^2, \qquad \ssig\,=\,\nsig\,=\,1 \label{eq:input_snr} \end{equation} For most analyses, it would be sufficient if input $x(t)$ corresponds to the signal representation immediately before the first of the tested transformations. For instance, when testing the effects of logarithmic compression~(Eq.\,\ref{eq:log}), $x(t)$ would correspond to the signal envelope $\env(t)$. However, in this particular case, $\env(t)$ results from a nonlinear transformation~(Eq.\,\ref{eq:env}), which cannot be synthesized as an additive mixture of $\soc(t)$ and $\noc(t)$. 
For this reason, any input $x(t)$ across all analyses corresponds not to the representation immediately before the tested transformations but its predecessor representation instead. Therefore, when testing logarithmic compression, $x(t)$ corresponds to the tympanal signal $\filt(t)$ instead of $\env(t)$. The raw $\soc(t)$ was drawn from the dataset of isolated species-specific song recordings, whereas the raw $\noc(t)$ consists of a segment of normally distributed white noise. Both $\soc(t)$ and $\noc(t)$ were normalized to unit standard deviation. These can be used without further processing for all analyses where input $x(t)$ corresponds to $\raw(t)$. For analyses where $x(t)$ corresponds to a later representation, $\soc(t)$ and $\noc(t)$ were first processed along the model pathway up to the required representation, again normalized to unit standard deviation, and then used to generate $x(t)$ according to either Eq.\,\ref{eq:noiseless} in the noiseless case or Eq.\,\ref{eq:noisy} in the noisy case. \subsubsection{Quantifying signal intensity across representations} \label{sec:intensity_measures} All intensity measures were calculated over a manually labeled segment within each song. Segments always excluded the first and last few syllables to allow slowly changing representations such as $f_i(t)$ to stabilize. The duration of each segment and the number of contained syllables depends on the duration of the species-specific song. Care was taken to ensure that the segment contained a sufficient number of syllables to obtain a reliable estimate of the intensity measures. The standard deviation $\sigma$ was used as a measure of intensity for all representations resulting from the transformation of input $x(t)$ up to and including the kernel responses $c_i(t)$, for which individual $\sigma_{c_i}$ were used as kernel-specific intensity measures. 
The binary responses $b_i(t)$ were deemed too similar to the features $f_i(t)$ to warrant their own intensity measure and were hence omitted from all related analyses. For $f_i(t)$, $\sigma$ is not an appropriate intensity measure because each $f_i(t)$ is ideally constant with $\sigma=0$ for the duration of a song. Therefore, the average value $\muf$ of each $f_i(t)$ was used as a kernel-specific intensity measure instead. It is arguably not ideal to quantify the intensity of $c_i(t)$ and $f_i(t)$ separately for each kernel. Overall, these representations are not separate signals bundled together but rather a set that acts as a unit with a single intensity measure. However, there is no straightforward way to quantify the intensity of $c_i(t)$ or $f_i(t)$ as a whole that would not entail a certain ambiguity, e.\,g.\ by averaging across kernels. In this sense, we opted for the kernel-specific approach because it allows us to assess differences in the dependency on $\sca$ between individual members of either $c_i(t)$ or $f_i(t)$. The absolute intensity measures allow us to compare the intensity of a representation across different $\sca$ values. Additionally, ratios were calculated between the intensity measures for $\sca>0$ and the respective pure-noise reference measure for $\sca=0$ to better compare the intensities of different representations. This is only possible in the noisy case, where input $x(t)=\noc(t)$ for $\sca=0$, whereas $x(t)=0$ for $\sca=0$ in the noiseless case. At the level of input $x(t)$, the ratio of intensity measures is given by the square root of $\sca^2+1$: \begin{equation} \frac{\xsig}{\nsig}\,=\,\sqrt{\frac{\xsig^2}{\nsig^2}}\,=\,\sqrt{\frac{(\sca\,\cdot\,\ssig)^2\,+\,\nsig^2}{\nsig^2}}\,=\,\sqrt{\sca^2\,+\,1}, \qquad \ssig\,=\,\nsig\,=\,1 \label{eq:input_ratio} \end{equation} This holds only if $\soc(t)\perp\noc(t)$, so that $\xsig^2=(\sca\,\cdot\,\ssig)^2+\nsig^2$, which is a reasonable assumption for the raw $\soc(t)$ and $\noc(t)$.
However, the dependency of the ratio on $\sca$ is not necessarily the same for representations that are transformed from $x(t)$ by nonlinear operations, since these change the relationship of $\soc(t)$ and $\noc(t)$ in an unpredictable fashion~(see appendix Fig.\,\ref{fig:app_env-sd}). Furthermore, the ratio is not a proper SNR of the representation because it does not relate $\soc(t)$ to $\noc(t)$ within the representation but rather the entire representation to $\noc(t)$ alone. However, it still provides a useful measure of the relative intensity of a representation with and without $\soc(t)$, which is the closest we can get to the SNR of the representation. As such, the ratio of intensity measures is referred to as SNR in the following. % Is this legal? "SNR" is much shorter than "ratio of intensity measure to the pure-noise reference measure". % Haven't used it much yet, stuck to "ratio" in most cases. \subsection{Field data-based analysis of the model pathway} Field recordings were taken on a meadow in the vicinity of the University of Tübingen, Germany, during the day in August~2024. All recordings were taken using a custom hand-held microphone array that was assembled from eight omnidirectional AV-TEFE TCM141 condenser microphones. The microphones were arranged in a linear configuration with a spacing of 30\,cm between adjacent microphones and oriented in the same direction along the axis of the array. All microphones were connected to a custom 8-channel amplification and digitization system based on a Teensy 4.1 microcontroller with real-time clock and microSD card storage. Recordings were written to the microSD card in~\textit{.wav}~format with a sampling rate of 96\,kHz and an amplitude scale in arbitrary units. The microphone array was held at a height of approximately 30\,cm above the ground, which was slightly above the height of most surrounding vegetation and at the same height as the singing grasshopper.
The array was moved as close to the grasshopper as possible without interrupting its song production, which amounts to an approximate offset distance of 10\,cm between the animal and the leading microphone. Care was taken to maintain a stable position and height of the microphone array during recording. The resulting recordings were then processed through the model pathway and analyzed according to the procedure described in Section~\ref{sec:intensity_measures}. \subsection{Determining kernel-specific threshold values} Different kernels $k_i(t)$ result in specific kernel responses $c_i(t)$, Eq.\,\ref{eq:conv}, which are then transformed further into binary responses $b_i(t)$, Eq.\,\ref{eq:binary}, by thresholding nonlinearity $\nl$. The threshold value $\thr$ is specific to each $k_i(t)$. Across all analyses, $\thr$ has been specified as a multiple of the pure-noise reference standard deviation $\sigma_{c_i}$ for input $x(t)=\noc(t)$. This ensures that $\thr$ as well as the resulting $b_i(t)$ and $f_i(t)$ are comparable across different $k_i(t)$ because each pure-noise $c_i(t)$ approximately follows a normal distribution around zero~(see appendix Figs.\,\ref{fig:app_thresh-lp_kern-sd}-\ref{fig:app_field_kern-sd}). \newpage \section{Results} \subsection{Mechanisms driving the emergence of intensity invariance} It is not necessary to test each processing step along the model pathway for intensity invariance. Instead, we can focus on those steps that involve nonlinear transformations, since these are the only steps that can potentially change the dependency on scale $\sca$ between the input and output representations. Overall, there are three nonlinear transformations along the model pathway: Full-wave rectification during envelope extraction, logarithmic compression, and the thresholding nonlinearity during feature extraction. 
In the following, we analyze the effects of each of these transformations on the intensity and SNR of the resulting representations as well as their potential contribution to intensity invariance. \subsubsection{Full-wave rectification \& lowpass filtering} The first nonlinear transformation along the model pathway is the full-wave rectification of the tympanal signal $\filt(t)$ during the extraction of the signal envelope (Eq.\,\ref{eq:env}). Rectification transforms the distribution of $\filt(t)$ from an approximately zero-centered distribution with both positive and negative values into a strictly non-negative distribution. Signal envelope $\env(t)$ is then obtained by lowpass filtering the rectified $\filt(t)$. The effects of this transformation pair on SNR and potential intensity invariance were analyzed by rescaling and processing the input signal $\raw(t)$ and comparing standard deviations between the resulting $\filt(t)$ and $\env(t)$, once for the noiseless case~(Fig.\,\ref{fig:rect-lp}a) and once for the noisy case~(Fig.\,\ref{fig:rect-lp}b). In addition, the cutoff frequency $\fc$ of the lowpass filter was varied to investigate the influence of different filter bandwidths. In the noiseless case, the standard deviations of $\filt(t)$ and $\env(t)$ are each reduced compared to the input $\raw(t)$ by a multiplicative factor. These factors are constant across all $\sca$, which results in a downward shift of the respective curve on a double-logarithmic scale, away from the diagonal~(Fig.\,\ref{fig:rect-lp}c). For $\filt(t)$, the reduction is a consequence of the bandpass filtering~(Eq.\,\ref{eq:bandpass}) of $\raw(t)$. For $\env(t)$, the standard deviation is further reduced compared to $\filt(t)$. Rectification contributes much less to this reduction than lowpass filtering. The degree of reduction by lowpass filtering depends on the cutoff frequency $\fc$, with lower $\fc$ (narrow bandwidth) resulting in a stronger reduction. 
In the noisy case, the standard deviations of $\filt(t)$ and $\env(t)$ can be related to the respective pure-noise reference standard deviation~(Fig.\,\ref{fig:rect-lp}d). This causes each curve to start with a constant regime of SNR values near 1 for smaller $\sca$, which reflects the dominance of the noise component $\noc(t)$ over the song component $\soc(t)$ in the input $\raw(t)$. For larger $\sca$, all curves transition into a regime of linearly increasing SNR on a double-logarithmic scale. For $\filt(t)$, the linear part of the curve deviates only slightly from the diagonal. For $\env(t)$, however, the transition occurs at lower $\sca$ compared to $\filt(t)$, and the linear part of the curve is shifted leftward away from the diagonal, which means that higher SNR values are achieved for the same $\sca$. This effect is more pronounced for lower $\fc$ of the lowpass filter and is presumably caused by the attenuation of high-frequency components in the signal, which are more prominent in the noise component $\noc(t)$ than in the song component $\soc(t)$. The effect also appears relatively consistent across different species, although small variations exist~(Fig.\,\ref{fig:rect-lp}e and appendix Fig.\,\ref{fig:app_rect-lp}). In summary, the standard deviation of $\env(t)$ has never been observed to saturate for larger $\sca$ but rather continues to increase proportionally to $\sca$ for all tested $\fc$, in both the noiseless and the noisy case and across different species. Consequently, the combination of rectification and lowpass filtering does not contribute to intensity invariance. However, this transformation pair does improve the SNR of $\env(t)$ relative to $\filt(t)$ and thus provides subsequent processing stages with a more robust input representation and higher input SNR. 
\begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_rect_lp.pdf} \caption{\textbf{Rectification and lowpass filtering improves SNR but does not contribute to intensity invariance.} Input $\raw(t)$ consists of $\soc(t)$ scaled by $\sca$ with optional $\noc(t)$ and is successively transformed into tympanal signal $\filt(t)$ and envelope $\env(t)$. \textbf{Top}:~Examples of $\filt(t)$ and $\env(t)$ for different $\sca$. \textbf{a}:~Noiseless case. \textbf{b}:~Noisy case. \textbf{Bottom}:~Intensity measures over $\sca$. Different line styles indicate different cutoff frequencies $\fc$ of the lowpass filter extracting $\env(t)$. \textbf{c}:~Noiseless case: Standard deviation $\sigma_x$ of $\filt(t)$ and $\env(t)$, respectively. \textbf{d}:~Noisy case: Ratio of $\sigma_x$ to the respective pure-noise reference $\sigma_{\eta}$ for $\sca=0$. \textbf{e}:~Ratio of $\sigma_x$ to $\sigma_{\eta}$ of $\env(t)$ as in \textbf{d} for different species (averaged over songs and recordings, appendix Fig.\,\ref{fig:app_rect-lp}). } \label{fig:rect-lp} \end{figure} \FloatBarrier \subsubsection{Logarithmic compression \& spike-frequency adaptation} The second nonlinear transformation along the model pathway is the logarithmic compression of the signal envelope $\env(t)$ into $\db(t)$, Eq.\,\ref{eq:log}, which is then followed by the highpass filtering of $\db(t)$, Eq.\,\ref{eq:highpass}, to obtain the intensity-adapted envelope $\adapt(t)$. The interplay of this transformation pair was analyzed by rescaling and processing the input signal $\filt(t)$ and comparing standard deviations between the resulting $\env(t)$, $\db(t)$, and $\adapt(t)$. It is necessary to use $\filt(t)$ as input for this analysis instead of $\env(t)$, because $\env(t)$ results from a nonlinear transformation and hence cannot be synthesized as an additive mixture of song component $\soc(t)$ and noise component $\noc(t)$. % <-- Sentence may be methods section material. 
However, it is much easier to conceive a mathematical description of the effects of logarithmic compression and adaptation if $\env(t)$ itself is assumed to be composed of $\soc(t)$ and $\noc(t)$. In the noiseless case~(Fig.\,\ref{fig:log-hp}a), $\env(t)$ takes the form of \begin{equation} \env(t)\,=\,\sca\,\cdot\,\soc(t), \qquad \env(t)\,>\,0\enspace\forall\enspace t\,\in\,\mathbb{R} \label{eq:toy_env_pure} \end{equation} The standard deviation of $\env(t)$ increases linearly with $\sca$ on a double-logarithmic scale and is slightly reduced~(Fig.\,\ref{fig:log-hp}c) compared to the input $\filt(t)$, which is consistent with the results of the previous analysis~(Fig.\,\ref{fig:rect-lp}c). By conversion of $\env(t)$ to decibel scale, $\sca$ turns from a multiplicative scale in linear space into an additive term, or offset, in logarithmic space: \begin{equation} \db(t)\,=\,20\,\cdot\,\dec \left[\,\sca\,\cdot\,s(t)\,\right]\,=\,20\,\cdot\,\left[\dec \sca\,+\,\dec s(t)\right], \qquad \sca\,>\,0 \label{eq:toy_log_pure} \end{equation} The highpass filtering of $\db(t)$ can be approximated as a subtraction of the local signal offset within a suitable time interval $0 \ll \thp < \frac{1}{\fc}$: \begin{equation} \begin{split} \adapt(t)\,\approx\,\db(t)\,-\,20\,\cdot\,\dec \sca\,=\,20\,\cdot\,\dec s(t) \end{split} \label{eq:toy_highpass_pure} \end{equation} This eliminates $\sca$ from $\adapt(t)$ and thus renders it perfectly intensity-invariant, with a constant standard deviation of around 10\,dB across all $\sca>0$~(Fig.\,\ref{fig:log-hp}c). 
In contrast, in the noisy case~(Fig.\,\ref{fig:log-hp}b), $\env(t)$ takes the form of \begin{equation} \env(t)\,=\,\sca\,\cdot\,\soc(t)\,+\,\noc(t), \qquad \env(t)\,>\,0\enspace\forall\enspace t\,\in\,\mathbb{R} \label{eq:toy_env_noise} \end{equation} Similar to the previous analysis~(Fig.\,\ref{fig:rect-lp}d), the ratio of the standard deviation of $\env(t)$ to its pure-noise reference standard deviation on a double-logarithmic scale follows a constant regime for small $\sca$ and a linearly increasing regime for larger $\sca$~(Fig.\,\ref{fig:log-hp}d). Decibel conversion of $\env(t)$ % \begin{equation} % \begin{split} % \db(t)\,&=\,20\,\cdot\,\dec \left[\,\sca\,\cdot\,s(t)\,+\,\eta(t)\,\right]\\ % &=\,20\,\cdot\,\left(\dec \sca\,+\,\dec \left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]\right), \qquad \sca\,>\,0 % \end{split} % \label{eq:toy_log_noise} % \end{equation} \begin{equation} \db(t)\,=\,20\,\cdot\,\left(\dec \sca\,+\,\dec \left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]\right), \qquad \sca\,>\,0 \label{eq:toy_log_noise} \end{equation} allows for the separation of $\sca$ from $\soc(t)$ but introduces a scaling of $\noc(t)$ by the inverse of $\sca$, which remains present even after the offset subtraction: \begin{equation} \begin{split} \adapt(t)\,\approx\,20\,\cdot\,\dec\left[s(t)\,+\,\frac{\eta(t)}{\sca}\right] \end{split} \label{eq:toy_highpass_noise} \end{equation} % \begin{equation} % \begin{split} % \adapt(t)\,\approx\,\db(t)\,-\,20\,\cdot\,\dec \sca\,=\,20\,\cdot\,\dec\left[s(t)\,+\,\frac{\eta(t)}{\sca}\right] % \end{split} % \label{eq:toy_highpass_noise} % \end{equation} This means that, in the noisy case, $\sca$ cannot be entirely eliminated from $\adapt(t)$, only redistributed between $\soc(t)$ and $\noc(t)$. If $\sca$ is sufficiently large ($\sca\gg1$, saturation regime), $\noc(t)$ is attenuated to the point of being negligible, so that $\adapt(t)$ is a scale-free representation of $\soc(t)$. 
If $\sca$ and $\noc(t)$ are at similar scales ($\sca\approx1$, transient regime), $\adapt(t)$ largely resembles $\db(t)$. Finally, if $\sca$ is sufficiently small ($0<\sca\ll1$, noise regime), $\noc(t)$ masks $\soc(t)$ even after the intensity adaptation. Accordingly, the effective intensity invariance of $\adapt(t)$ through logarithmic compression and adaptation is limited by the SNR of $\env(t)$: Songs that have already sunk into the noise floor at the level of $\env(t)$ cannot be recovered by subsequent processing steps. The general pattern of noise regime, transient regime, and saturation regime remains consistent across different species~(Fig.\,\ref{fig:log-hp}e). However, the saturation point --- the $\sca$ value at which the SNR of $\adapt(t)$ starts to saturate --- and the saturation level --- the constant SNR of $\adapt(t)$ within the saturation regime --- vary considerably between and within species~(appendix Figs.\,\ref{fig:app_log-hp_curves}+\ref{fig:app_log-hp_saturation}). For example, \textit{C. biguttulus} and \textit{C. mollis} display a noticeably lower saturation level compared to other species. These differences are not to be underestimated, since the saturation level of $\adapt(t)$ determines the maximum input SNR for subsequent processing steps. In other words, the fact that $\adapt(t)$ eventually reaches a saturation regime is, of course, desirable in the context of intensity invariance, but it also means passing up on the higher SNR values that are achieved by $\env(t)$ for the same $\sca$ (up to several orders of magnitude, Fig.\,\ref{fig:log-hp}d). This trade-off between intensity invariance and SNR is a recurring phenomenon that is further addressed in the following sections.
\begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_log_hp.pdf} \caption{\textbf{Intensity invariance through logarithmic compression and adaptation is restricted by the noise floor and decreases SNR.} Input $\filt(t)$ consists of $\soc(t)$ scaled by $\sca$ with optional $\noc(t)$ and is successively transformed into envelope $\env(t)$, logarithmically compressed envelope $\db(t)$, and intensity-adapted envelope $\adapt(t)$. \textbf{Top}:~Examples of $\env(t)$, $\db(t)$, and $\adapt(t)$ for different $\sca$. \textbf{a}:~Noiseless case. \textbf{b}:~Noisy case. \textbf{Bottom}:~Intensity measures over $\sca$. \textbf{c}:~Noiseless case: Standard deviation $\sigma_x$ of $\env(t)$, $\db(t)$, and $\adapt(t)$, respectively. \textbf{d}:~Noisy case: Ratio of $\sigma_x$ to the respective pure-noise reference $\sigma_{\eta}$ for $\sca=0$. Shaded areas indicate $5\,\%$ (dark grey) and $95\,\%$ (light grey) curve span for $\adapt(t)$. \textbf{e}:~Ratio of $\sigma_x$ to $\sigma_{\eta}$ of $\adapt(t)$ as in \textbf{d} for different species (averaged over songs and recordings, appendix Fig.\,\ref{fig:app_log-hp_curves}). Dots indicate $95\,\%$ curve span per species. } \label{fig:log-hp} \end{figure} \FloatBarrier \subsubsection{Thresholding nonlinearity \& temporal averaging} The third nonlinear transformation along the model pathway is the thresholding nonlinearity $\nl$ that transforms each kernel response $c_i(t)$ into a binary response $b_i(t)$, Eq.\,\ref{eq:binary}. This transformation takes place after the convolutional filtering of $\adapt(t)$ with kernel $k_i(t)$, Eq.\,\ref{eq:conv}, and is followed by the temporal averaging of $b_i(t)$ into the feature set $f_i(t)$ by a lowpass filter, Eq.\,\ref{eq:lowpass}. The effects of thresholding and temporal averaging are best illustrated based on a single kernel~(Fig.\,\ref{fig:thresh-lp_single}) instead of the full set.
For this analysis, input $\adapt(t)$ was rescaled~(Fig.\,\ref{fig:thresh-lp_single}a) and convolved with kernel $k(t)$. The resulting kernel response $c(t)$ was passed through $H(c\,-\,\Theta)$ with three different threshold values $\Theta$~(Fig.\,\ref{fig:thresh-lp_single}b-d). Each resulting binary response $b(t)$ was transformed into $f(t)$, whose average feature value $\mu_f$ serves as a measure of intensity~(Fig.\,\ref{fig:thresh-lp_single}ef). The thresholding nonlinearity $H(c\,-\,\Theta)$ categorizes the values of $c(t)$ into ``relevant'' ($c(t)>\Theta$, $b(t)=1$) and ``irrelevant'' ($c(t)\leq\Theta$, $b(t)=0$) response values. It thereby splits the probability density $\pc$ of $c(t)$ within some observed time interval $T$ into two complementary parts around $\Theta$: \begin{equation} \int_{\Theta}^{+\infty} \pc\,dc\,=\,1\,-\,\int_{-\infty}^{\Theta} \pc\,dc\,=\,\frac{T_1}{T}, \qquad \infint \pc\,dc\,=\,1 \label{eq:pdf_split} \end{equation} The right-sided part of the split $\pc$ corresponds to time $T_1$ where $c(t)>\Theta$, while the left-sided part corresponds to time $T_0=T-T_1$ where $c(t)\leq\Theta$. The improper integral over the right-sided part of $\pc$ represents the ratio of time $T_1$ to total time $T$ because the integral of a probability density over its entire domain is normalized to 1. The lowpass filtering of $b(t)$ can be approximated as temporal averaging over a suitable time interval $\tlp>\frac{1}{\fc}$ in order to express $f(t)$ as a similar temporal ratio \begin{equation} f(t)\,\approx\,\frac{1}{\tlp} \int_{t}^{t\,+\,\tlp} b(\tau)\,d\tau\,=\,\frac{T_1}{\tlp}, \qquad b(t)\,\in\,\{0,\,1\} \label{eq:feat_avg} \end{equation} of time $T_1$ during which $b(t)$ is 1 within the averaging interval $\tlp$.
Therefore, the value of $f(t)$ at every time point $t$ approximately signifies the cumulative probability that $c(t)$ exceeds $\Theta$ during the corresponding averaging interval $\tlp$: \begin{equation} f(t)\,\approx\,\int_{\Theta}^{+\infty} \pclp\,dc\,=\,P(c\,>\,\Theta,\,\tlp) \label{eq:feat_prop} \end{equation} In a sense, $f(t)$ can be interpreted as some sort of duty cycle with respect to $\Theta$. For example, a feature value of $f(t)=0.4$ means that $c(t)$ exceeds $\Theta$ for approximately 40\,\% of the time within $\tlp$ around $t$. In the most extreme cases, $\Theta$ lies either above the maximum of $c(t)$ or below the minimum of $c(t)$, which results in a minimum or maximum possible feature value of $f(t)=0$~(Fig.\,\ref{fig:thresh-lp_single}d, left column) or $f(t)=1$, respectively. Importantly, $f(t)$ neither retains information about the timing of individual threshold crossings nor the precise values of $c(t)$ apart from their relation to $\Theta$. Accordingly, for a given $\Theta$, different $\sca$ can still result in similar $T_1$ segments (and hence similar feature values) depending on the magnitude of the derivative of $c(t)$ in temporal proximity to time points at which $c(t)$ crosses $\Theta$: The steeper the slope of $c(t)$, the less $T_1$ changes with variations in $\sca$. The most reliable way of exploiting this invariant property of $f(t)$ is to set $\Theta$ to a value near 0, because these values are least affected by different scales of $c(t)$. For sufficiently large $\sca$, $f(t)$ then approaches the same constant $\mu_f$ in both the noiseless and the noisy case~(Fig.\,\ref{fig:thresh-lp_single}e, saturation regime). The saturation level of $f(t)$ is independent of the precise value of $\Theta$, but the saturation point decreases with $\Theta$~(Fig.\,\ref{fig:thresh-lp_single}e). Therefore, a threshold value of $\Theta=0$ would be the optimal choice for achieving intensity invariance at the lowest possible $\sca$.
In stark contrast, the closer $\Theta$ is to 0, the higher $\mu_f$ in response to the pure noise component $\noc(t)$ and the lower the resulting SNR of $f(t)$ between noise regime and saturation regime~(Fig.\,\ref{fig:thresh-lp_single}b-d, left column, and Fig.\,\ref{fig:thresh-lp_single}e). This trade-off between intensity invariance and SNR has already been observed during the previous analysis on logarithmic compression and adaptation~(Fig.\,\ref{fig:log-hp}d). Finally, the effects of thresholding and temporal averaging must be seen in the context of the previous transformation pair of logarithmic compression and adaptation: In the current analysis, the input $\adapt(t)$ can be rescaled by arbitrarily large $\sca$, while in the full pathway, the current input $\adapt(t)$ is the output $\adapt(t)$ of the previous transformation pair and is hence capped to a maximum standard deviation of around 10\,dB~(Fig.\,\ref{fig:log-hp}cd). This can be illustrated by plotting $\mu_f$ not over $\sca$~(Fig.\,\ref{fig:thresh-lp_single}e) but over the standard deviation of input $\adapt(t)$ instead~(Fig.\,\ref{fig:thresh-lp_single}f). It becomes apparent that $\mu_f$ saturates only for standard deviations of $\adapt(t)$ that would already be capped. Accordingly, $f(t)$ never reaches the saturation regime as determined by the current transformation pair but rather adheres to the saturation regime determined by the previous transformation pair. In this case, the saturated $\mu_f$ is not independent of $\Theta$ anymore. The consequences of this interaction between the two mechanisms of intensity invariance are further explored in a later section. 
\begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_thresh_lp_single.pdf} \caption{\textbf{Intensity invariance through thresholding and temporal averaging is mediated by the interaction of threshold value and noise floor.} Input $\adapt(t)$ consists of $\soc(t)$ scaled by $\sca$ with optional $\noc(t)$ and is transformed into single kernel response $c(t)$, binary response $b(t)$, and feature $f(t)$. Different color shades indicate different threshold values $\Theta$ (multiples of pure-noise standard deviation $\sigma_{\eta}$ of $c(t)$ for $\sca=0$, with darker colors for higher $\Theta$. See also appendix Fig.\,\ref{fig:app_thresh-lp_kern-sd}). \textbf{Left}:~Noisy case: Examples of $\adapt(t)$ as well as $c(t)$, $b(t)$, and $f(t)$ for different $\sca$. \textbf{a}:~$\adapt(t)$ with kernel $k(t)$ in black. \textbf{b\,-\,d}: $c(t)$, $b(t)$, and $f(t)$ based on the same $\adapt(t)$ from \textbf{a} but for different $\Theta$. \textbf{Right}:~Average value $\mu_f$ of $f(t)$ for each $\Theta$ from \textbf{b\,-\,d}. Dots indicate $95\,\%$ curve span (noisy case). \textbf{e}:~$\mu_f$ over $\sca$, once for the noisy case (solid lines) and once for the noiseless case (dotted lines). \textbf{f}:~Noisy case: $\mu_f$ over standard deviation $\sigma_{\text{adapt}}$ of input $\adapt(t)$ corresponding to $\sca$ shown in \textbf{e}. Shaded area indicates values of $\sigma_{\text{adapt}}$ that are capped in the output $\adapt(t)$ of the previous transformation pair (Fig.\,\ref{fig:log-hp}cd). } \label{fig:thresh-lp_single} \end{figure} \FloatBarrier \subsection{Intensity invariance of species-specific feature representations} Having established both the meaning of the feature value and the mechanism of intensity invariance by thresholding and temporal averaging, the question remains how this mechanism acts on a set of features $f_i(t)$ based on different species-specific songs~(Fig.\,\ref{fig:thresh-lp_species}a).
The previous analysis was repeated with three different kernels $k_i(t)$ using a single kernel-specific threshold value $\thr$; and the resulting average feature values $\muf$ were plotted over $\sca$~(Fig.\,\ref{fig:thresh-lp_species}bc). Additionally, 2D feature spaces spanned by each pair of $f_i(t)$ were plotted to investigate the separability of species-specific songs based on the feature representation in dependence of $\sca$~(Fig.\,\ref{fig:thresh-lp_species}de). Each species-specific combination of $\muf$ follows a trajectory through feature space that develops with $\sca$. These trajectories correspond to the transient regime between the constant (noise) regime and the saturation regime, which are only visible as the start and end points of the trajectories, respectively. The horizontal dashes in the colorbars indicate the range of $\sca$ that corresponds to the transient regime across $f_i(t)$ for each species. In the noiseless case, each $\muf$ is 0 for small $\sca$ across all species~(Fig.\,\ref{fig:thresh-lp_species}b) because $c_i(t)$ never exceeds $\thr$. Accordingly, each trajectory starts at the origin of the feature space~(Fig.\,\ref{fig:thresh-lp_species}d). For larger $\sca$, all $\muf$ saturate at individual values whose combination differs between species, so that the songs of each species are eventually represented by distinct points in feature space. However, the species-specific trajectories cross each other at numerous points, which means that the songs of two species --- each at a specific $\sca$ --- can result in the same combination of $\muf$. Furthermore, the specific saturation point of $f_i(t)$ depends on the species: For \textit{C. mollis}, all $\muf$ saturate around the same $\sca$, while \textit{O. rufipes} exhibits considerable variation between the three $f_i(t)$. The larger the variation in saturation points between $f_i(t)$, the stronger the curvature of the trajectory through feature space. 
In the noisy case, $\muf$ is non-zero even for the smallest $\sca$~(Fig.\,\ref{fig:thresh-lp_species}c) because the addition of the noise component $\noc(t)$ to input $\adapt(t)$ drives $c_i(t)$ above $\thr$ regardless of the song component $\soc(t)$. The starting value of $\muf$ is the same across all $f_i(t)$ and species by construction of the specific $\thr$. In consequence, the trajectories through feature space do not start at the origin but rather at approximately the same point along the diagonal~(Fig.\,\ref{fig:thresh-lp_species}e). For larger $\sca$, all $\muf$ saturate at the same values as in the noiseless case, as expected from the previous analysis~(Fig.\,\ref{fig:thresh-lp_single}e). However, the trajectories now move a much shorter distance through feature space for a similar range of $\sca$ due to the lower SNR of $f_i(t)$ between noise regime and saturation regime, which increases the likelihood of trajectories crossing each other. Finally, the saturation points of $f_i(t)$ for a given species are slightly higher in the noisy case, but the variation between $f_i(t)$ remains largely unchanged. In summary, even a comparably small set of three features $f_i(t)$ can, in principle, represent different species-specific songs at distinct points in feature space, regardless of the presence of noise. However, this only holds for sufficiently large $\sca$ that allow $f_i(t)$ to reach a saturation regime. During the transient regime, the species-specific combination of $\muf$ can very well be the same for two or more different species at specific $\sca$, although this may be alleviated by the inclusion of additional $f_i(t)$. Overall, the results of this analysis suggest that $\thr$ should rather be chosen in favor of a higher SNR ($\thr$ just above pure-noise $c_i(t)$) than a lower saturation point ($\thr\to0$).
First, because this reduces the density of trajectories through feature space, and second, because the capping of $\adapt(t)$ by the previous transformation pair likely renders the saturation point of $f_i(t)$ less relevant. \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_thresh_lp_species.pdf} \caption{\textbf{Feature representation of different species-specific songs saturates at different points in feature space.} Same input and processing as in Fig.\,\ref{fig:thresh-lp_single} but with three different kernels $k_i$ and a single kernel-specific threshold value $\thr=0.5\cdot\sigma_{\eta_i}$ (appendix Fig.\,\ref{fig:app_thresh-lp_kern-sd}). \textbf{a}:~Examples of species-specific grasshopper songs. \textbf{Middle}:~Average value $\muf$ of each feature $f_i(t)$ over $\sca$ per species (averaged over songs and recordings, appendix Figs.\,\ref{fig:app_thresh-lp_pure} and \ref{fig:app_thresh-lp_noise}). Different color shades indicate different $k_i$. Dots indicate $95\,\%$ curve span per $k_i$. \textbf{b}:~Noiseless case. \textbf{c}:~Noisy case. \textbf{Bottom}:~2D feature spaces spanned by each pair of $f_i(t)$. Each trajectory corresponds to a species-specific combination of $\muf$ that develops with $\sca$ (colorbars). Horizontal dashes in the colorbar indicate $5\,\%$ (dark grey) and $95\,\%$ (light grey) curve span of the norm across all three $\muf$ per species. \textbf{d}:~Noiseless case. \textbf{e}:~Noisy case. Shaded areas indicate the average minimum $\muf$ across all species-specific trajectories. } \label{fig:thresh-lp_species} \end{figure} \FloatBarrier \subsection{Intensity invariance along the full model pathway} Through the previous analyses, we could establish two mechanisms of intensity invariance: Logarithmic compression and adaptation as well as thresholding and temporal averaging. 
While each transformation pair by itself can provide some level of invariance, certain results suggest that the first mechanism may actually limit or even nullify the effect of the second mechanism. In the following sections, we investigate the combined effect of both mechanisms along the full model pathway~(Fig.\,\ref{fig:pipeline_full}) and explore the consequences of disabling the first mechanism by skipping the logarithmic compression step~(Fig.\,\ref{fig:pipeline_short}). \subsubsection{Including logarithmic compression} For this analysis, input $\raw(t)$ --- including both song component $\soc(t)$ and noise component $\noc(t)$ --- was rescaled and processed throughout all steps of the model pathway~(Fig.\,\ref{fig:pipeline_full}a) up to the feature set $f_i(t)$. As before, the standard deviation was used as intensity measure for each resulting representation except $b_i(t)$ and $f_i(t)$. For $f_i(t)$, the average feature value $\muf$ was used, while $b_i(t)$ was omitted from the analysis. Plotting each intensity measure over $\sca$~(Fig.\,\ref{fig:pipeline_full}b) reinforces many of the previous observations. For ease of visualization, the kernel-specific curves for $c_i(t)$ and $f_i(t)$ were summarized by their median. Representations prior to logarithmic compression --- $\filt(t)$ and $\env(t)$ --- show a linear increase of the intensity measure for larger $\sca$ on a double-logarithmic scale. Representations after logarithmic compression --- $\db(t)$, $\adapt(t)$, and $c_i(t)$ --- are the first to reach a saturation regime and do so at approximately the same $\sca$ because they are separated only by linear transformations. Feature set $f_i(t)$ reaches a saturation regime, as well. 
But contrary to previous results, the saturation point of $f_i(t)$ appears below that of $c_i(t)$, which suggests that the second mechanism of thresholding and temporal averaging can indeed improve intensity invariance beyond the first mechanism of logarithmic compression and adaptation. The difference in saturation points is best illustrated based on the ratio of each intensity measure to the respective pure-noise reference value~(Fig.\,\ref{fig:pipeline_full}d). However, compressing $f_i(t)$ into a median across $k_i(t)$ conceals many kernel-specific details. It is therefore necessary to consider the development of each $f_i(t)$ over $\sca$ separately~(Fig.\,\ref{fig:pipeline_full}c). Indeed, all 40 $f_i(t)$ in the set reach a saturation regime for sufficiently large $\sca$. The saturated $\muf$ are distributed over a range of values --- which is the prerequisite for forming species-specific combinations --- but are limited to a rather small subset of possible values between 0 and 1. Based on previous results~(Fig.\,\ref{fig:thresh-lp_single}f), this is likely due to the capping of $\adapt(t)$ that prevents $f_i(t)$ from reaching its intrinsic saturation value; but this cannot be confirmed until the following analysis~(Fig.\,\ref{fig:pipeline_short}). Looking at the kernel-specific SNR values of $c_i(t)$ over $\sca$~(Fig.\,\ref{fig:pipeline_full}e) and $f_i(t)$ over $\sca$~(Fig.\,\ref{fig:pipeline_full}f) reveals a high degree of variation between different $k_i(t)$. Certain $f_i(t)$ achieve much higher SNR values than $c_i(t)$ for the same $\sca$ due to the former's capacity for arbitrarily low pure-noise responses ($\muf\to0$) and hence arbitrarily high SNR values. Finally, the question remains whether the suspected improvement of intensity invariance by $f_i(t)$ beyond $c_i(t)$ holds at the level of individual $k_i(t)$.
The single saturation points based on the median across $k_i(t)$ for $c_i(t)$ and $f_i(t)$ are expanded into distributions of kernel-specific saturation points~(Fig.\,\ref{fig:pipeline_full}g). For $c_i(t)$, the distribution is rather narrow and corresponds well to the single saturation point based on the median. For $f_i(t)$, however, the distribution is much broader and is not centered around the single saturation point based on the median but rather shifted towards lower $\sca$. Care must be taken when interpreting the height of either distribution due to the logarithmic scaling of the underlying $\sca$ axis. Nevertheless, the overall pattern suggests that the saturation points of specific $f_i(t)$ are indeed lower than those of their $c_i(t)$ counterparts. Therefore, the effect of thresholding and temporal averaging on intensity invariance is not necessarily nullified by the previous logarithmic compression and adaptation. \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_full_Omocestus_rufipes.pdf} \caption{\textbf{Step-wise emergence of intensity-invariant song representations along the model pathway.} Input $\raw(t)$ consists of $\soc(t)$ scaled by $\sca$ with added $\noc(t)$ and is processed up to the feature set $f_i(t)$ using kernel-specific threshold values $\thr=2\cdot\sigma_{\eta_i}$ (appendix Fig.\,\ref{fig:app_full_kern-sd}). Different color shades indicate different types of Gabor kernels with specific lobe number $\kn$ and either $+$ or $-$ sign, sorted (dark to light) first by increasing $\kn$ and then by sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and $16\,$ms per type; 8 types, 40 $k_i(t)$ in total). \textbf{a}:~Examples of $\filt(t)$, $\env(t)$, $\db(t)$, $\adapt(t)$, $c_i(t)$, and $f_i(t)$ for different $\sca$. \textbf{b}:~Intensity measures over $\sca$. The median over $k_i(t)$ is shown for $c_i(t)$ and $f_i(t)$. 
Dots indicate $95\,\%$ curve span for $\db(t)$, $\adapt(t)$, $c_i(t)$, and $f_i(t)$. \textbf{c}:~Average value $\muf$ of each $f_i(t)$ over $\sca$. \textbf{d}:~Ratio of intensity measures from \textbf{b} to the respective pure-noise reference for $\sca=0$. \textbf{e}:~Ratio of standard deviation $\sigma_{c_i}$ of each $c_i(t)$. \textbf{f}:~Ratio of $\muf$. \textbf{g}:~Distributions of kernel-specific $\sca$ that correspond to $95\,\%$ curve span for $c_i(t)$ and $f_i(t)$. Dots indicate values based on the median from \textbf{b}. } \label{fig:pipeline_full} \end{figure} \FloatBarrier \subsubsection{Excluding logarithmic compression} The previous analysis was repeated in exactly the same way as before, except that the logarithmic compression of $\env(t)$, Eq.\,\ref{eq:log}, was skipped in order to disable the first mechanism of intensity invariance. Consequently, $\adapt(t)$ is merely a highpass filtered version of $\env(t)$; and $\db(t)$ is missing entirely~(Fig.\,\ref{fig:pipeline_short}a). As expected, all representations prior to the thresholding nonlinearity $\nl$ --- $\filt(t)$, $\env(t)$, $\adapt(t)$, and $c_i(t)$ --- show a linear increase of the intensity measure for larger $\sca$, while $f_i(t)$ is the only representation to reach a saturation regime~(Fig.\,\ref{fig:pipeline_short}bd). The saturated $\muf$ are distributed over a much broader range of values than in the previous analysis~(Fig.\,\ref{fig:pipeline_short}c). Intriguingly, the distribution of $\muf$ is symmetric around a value of 0.5. This is relevant because every kernel $k^+(t)$ in the underlying kernel set has a counterpart of opposite sign that is otherwise identical, so that $k^+(t)=-k^-(t)$. The responses of $k^+(t)$ and $k^-(t)$ to the same input $\adapt(t)$ are also inverted because convolution is a linear operation: $c^+(t)=-c^-(t)$. The distributions of $c^+(t)$ and $c^-(t)$ are hence inverted to each other, as well: $p(c^+)=p(-c^-)$. 
Based on Eq.\,\ref{eq:feat_prop}, transforming $c^+(t)$ and $c^-(t)$ further using the same $\Theta$ thus results in two complementary features $f^+(t)$ and $f^-(t)$ that are symmetric around 0.5, so that $f^+(t)=1-f^-(t)$. Of course, this symmetry throughout the feature representation goes hand in hand with a substantial degree of redundancy and is hardly expected to be present in the actual grasshopper auditory system. But the fact that the saturated $\muf$ are distributed symmetrically around 0.5 provides concrete evidence that each $f_i(t)$ is able to reach its intrinsic saturation level in the absence of logarithmic compression~(Fig.\,\ref{fig:pipeline_short}c), which is otherwise prevented by the capping of $\adapt(t)$, as seen during previous analyses~(Fig.\,\ref{fig:thresh-lp_single}f and Fig.\,\ref{fig:pipeline_full}c). Otherwise, there appear to be no major differences in the development of $f_i(t)$ over $\sca$ compared to the previous analysis, neither in the kernel-specific SNR values~(Fig.\,\ref{fig:pipeline_short}e) nor in the distribution of kernel-specific saturation points~(Fig.\,\ref{fig:pipeline_short}f). Overall, the most substantial consequence of skipping the logarithmic compression is that it allows $f_i(t)$ to reach its intrinsic saturation value. If this results in a wider range of $\muf$ across the feature set, it should be beneficial for forming species-specific combinations. However, this depends on multiple different factors such as the choice of $k_i(t)$ and $\thr$ as well as the structure and distribution of the specific song and is hence not guaranteed simply by disabling logarithmic compression.
\begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_short_Omocestus_rufipes.pdf} \caption{\textbf{Effects of disabling logarithmic compression on intensity invariance along the model pathway.} Same input and processing as in Fig.\,\ref{fig:pipeline_full}, using kernel-specific threshold values $\thr=2\cdot\sigma_{\eta_i}$ (appendix Fig.\,\ref{fig:app_short_kern-sd}), except that logarithmic compression and hence $\db(t)$ are skipped. \textbf{a}:~Examples of $\filt(t)$, $\env(t)$, $\adapt(t)$, $c_i(t)$, and $f_i(t)$ for different $\sca$. \textbf{b}:~Intensity measures over $\sca$. The median over $k_i(t)$ is shown for $c_i(t)$ and $f_i(t)$. Dot indicates $95\,\%$ curve span for $f_i(t)$. \textbf{c}:~Average value $\muf$ of each $f_i(t)$ over $\sca$. \textbf{d}:~Ratio of intensity measures from \textbf{b} to the respective pure-noise reference for $\sca=0$. \textbf{e}:~Ratio of $\muf$. \textbf{f}:~Distribution of kernel-specific $\sca$ that correspond to $95\,\%$ curve span for $f_i(t)$. Dot indicates value based on the median from \textbf{b}. } \label{fig:pipeline_short} \end{figure} \FloatBarrier \subsubsection{Intensity invariance in a naturalistic setting} % This one appears...meh? So far, the analyses on intensity invariance were based on synthetically generated input signals, since these allow for a systematic manipulation of the mixture of song component $\soc(t)$ and noise component $\noc(t)$ over an arbitrary range of scales $\sca$. Now, the question remains how the model pathway performs under more naturalistic conditions. The previous analysis of the full model pathway~(Fig.\,\ref{fig:pipeline_full}) was hence repeated, using field recordings of a song of \textit{P. parallelus} as input $\raw(t)$ and a segment of background noise from the same recordings as pure-noise reference. 
Recordings were taken simultaneously at eight different distances $d$ from the sender, ranging from $10\,$cm to $220\,$cm with intervals of $30\,$cm between microphones. The precise value of $\sca$ that corresponds to a given $d$ cannot be determined in a straightforward manner, but $\sca$ is expected to be inversely proportional to $d$ based on the inverse-square law of sound propagation. All intensity measures and ratios thereof were hence plotted over $1/d$ on a double-logarithmic scale, which is comparable to previous analyses insofar as a decade on the $1/d$ axis corresponds to a decade on the $\sca$ axis. To complicate matters further, the $1/d$ axis is sampled too sparsely to determine saturation points as before based on the $95\,\%$ curve span. Instead, one has to rely on the slope of the curve to assess if, and at which $1/d$, a given representation reaches a saturation regime. Bearing these limitations in mind, the intensity measures of each representation over $1/d$~(Fig.\,\ref{fig:pipeline_field}b) follow a pattern that is consistent with the results of the previous simulation-based analysis~(Fig.\,\ref{fig:pipeline_full}b): The standard deviations of $\filt(t)$ and $\env(t)$ increase linearly with $1/d$. The standard deviations of $\db(t)$, $\adapt(t)$, and $c_i(t)$ show a weaker increase with $1/d$ and appear to approach, but not reach, a saturation regime for larger $1/d$. The average feature values $\muf$ of $f_i(t)$ show an even weaker increase with $1/d$ and appear to reach a saturation regime for $d=40\,$cm and $d=10\,$cm, which is consistent across most $f_i(t)$ in the set~(Fig.\,\ref{fig:pipeline_field}c). Saturation of $f_i(t)$ without saturation of $c_i(t)$ suggests that the input $\raw(t)$ at the smallest $d=10\,$cm corresponds to a value of $\sca$ between 10 and 20 based on comparison with the simulation-based analysis~(Fig.\,\ref{fig:pipeline_full}b).
The saturated $\muf$ are distributed over a comparably narrow range of values, which could in part be a property of the songs of \textit{P. parallelus}~(see also Fig.\,\ref{fig:thresh-lp_species}bc). The ratios of each intensity measure to the respective pure-noise reference value are not aligned across representations~(Fig.\,\ref{fig:pipeline_field}d) or kernels~(Fig.\,\ref{fig:pipeline_field}ef) but serve to consolidate the previous observation that only $f_i(t)$ exhibits some degree of intensity invariance within the available range of $1/d$. Based on the current results, this intensity invariance of $f_i(t)$ in the field holds up to a distance of around $40\,$cm from the sender, decays steadily between $40\,$cm and $130\,$cm, and is substantially diminished for larger distances~(Fig.\,\ref{fig:pipeline_field}a, bottom row). \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_field.pdf} \caption{\textbf{Intensity invariance along the model pathway in a naturalistic setting.} Input $\raw(t)$ consists of a song of \textit{P. parallelus} recorded in the field at eight different distances $d$ and is processed up to the feature set $f_i(t)$ using kernel-specific threshold values $\thr=2\cdot\sigma_{\eta_i}$ (appendix Fig.\,\ref{fig:app_field_kern-sd}). Different color shades indicate different types of Gabor kernels with specific lobe number $\kn$ and either $+$ or $-$ sign, sorted (dark to light) first by increasing $\kn$ and then by sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and $16\,$ms per type; 8 types, 40 $k_i(t)$ in total). \textbf{a}:~$\filt(t)$, $\env(t)$, $\db(t)$, $\adapt(t)$, $c_i(t)$, and $f_i(t)$ at each $d$. A noise segment from the same recording is shown for reference. \textbf{b}:~Intensity measures over $d$. The median over $k_i(t)$ is shown for $c_i(t)$ and $f_i(t)$. \textbf{c}:~Average value $\muf$ of each $f_i(t)$ over $d$.
\textbf{d}:~Ratio of intensity measures from \textbf{b} to the respective value obtained from the noise reference. \textbf{e}:~Ratio of standard deviation $\sigma_{c_i}$ of each $c_i(t)$. \textbf{f}:~Ratios of $\muf$. } \label{fig:pipeline_field} \end{figure} \FloatBarrier \subsection{Interspecific and intraspecific feature variability} In the final analysis of the current study, we investigated the variability of songs in the feature representation between different species and within the same species~(Fig.\,\ref{fig:feat_cross_species}). Naturally, a feature representation that is both consistent across different songs of the same species and sufficiently different between songs of different species is a fundamental prerequisite for species-specific song recognition. The data used in this analysis corresponds to the saturated $\muf$ of each $f_i(t)$ from the previous analysis of the full model pathway~(Fig.\,\ref{fig:pipeline_full}c), using different songs of \textit{O. rufipes} for the intraspecific comparisons and single songs from a number of species for the interspecific comparisons (also shown in Fig.\,\ref{fig:thresh-lp_species}a). Accordingly, each song is represented by 40 values of $\muf$ based on the same set of $f_i(t)$. For each comparison, $\muf$ from one song was plotted against $\muf$ from the other song, so that each dot within a subplot corresponds to a single feature $f_i(t)$. For the intraspecific comparisons~(Fig.\,\ref{fig:feat_cross_species}, upper triangular), the pairs of $\muf$ are distributed closely around the diagonal, with a minimum correlation coefficient of $\rho=0.82$, a maximum of $\rho=0.99$, and a median of $\rho=0.91$. A given $f_i(t)$ thus tends to have a similar $\muf$ across different songs of the same species. In contrast, the pairs of $\muf$ for the interspecific comparisons~(Fig.\,\ref{fig:feat_cross_species}, lower triangular) are distributed in a variety of different ways, most in broader clouds (e.g. \textit{C. 
biguttulus} vs. \textit{C. mollis}) but some more narrowly around the diagonal (e.g. \textit{P. parallelus} vs. \textit{C. dispar}). The correlation coefficients $\rho$ vary widely between different interspecific comparisons, with a minimum of $\rho=-0.1$, a maximum of $\rho=0.91$, and a median of $\rho=0.40$. A given $f_i(t)$ therefore tends to have a less similar $\muf$ across different species than within the same species, although certain exceptions exist~(Fig.\,\ref{fig:feat_cross_species}, lower right). Accordingly, the feature representation that is generated by the model pathway is, in principle, suitable for the distinction between different species-specific songs. However, even the songs of the same species are subject to considerable variability in various aspects and depending on a multitude of external and internal factors, which cannot be fully captured based on a limited number of songs. The results of the current analysis are hence to be treated as a proof-of-concept that paves the way towards more comprehensive investigations on the details of song representation in feature space, including the effects of different parameters of the model pathway as well as the inclusion of additional songs and species to reflect the complexity of natural song variation. \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_features_cross_species.pdf} \caption{\textbf{Interspecific and intraspecific feature variability.} Average value $\muf$ of each feature $f_i(t)$ against its counterpart from a second feature set based on a different input $\raw(t)$. Data is based on the saturated $\muf$ from Fig.\,\ref{fig:pipeline_full}. Each dot within a subplot represents a single $f_i(t)$.
Different color shades indicate different types of Gabor kernels with specific lobe number $\kn$ and either $+$ or $-$ sign, sorted (dark to light) first by increasing $\kn$ and then by sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and $16\,$ms per type; 8 types, 40 kernels in total). \textbf{Lower triangular}:~Interspecific comparisons between single songs of different species. \textbf{Upper triangular}:~Intraspecific comparisons between different songs of a single species (\textit{O. rufipes}). \textbf{Lower right}:~Distribution of correlation coefficients $\rho$ for each interspecific and intraspecific comparison. Dots indicate single $\rho$ values.\\ } \label{fig:feat_cross_species} \end{figure} \FloatBarrier \newpage \section{Discussion} In the current study, we have established a physiologically inspired functional model of the grasshopper song recognition pathway. The model pathway covers the entire auditory processing stream, from the sound reception at the tympanal membrane over peripheral receptor neurons and local interneurons up to the generation of a high-dimensional feature representation at the level of the ascending neurons and beyond in the SEG. Using this model pathway, we have identified two computational key mechanisms for the emergence of intensity-invariant song representations. Each mechanism comprises a nonlinear transformation and a subsequent linear transformation. The first mechanism consists of logarithmic compression and adaptation, which takes place at the level of the receptor neurons and local interneurons. The second mechanism consists of thresholding and temporal averaging, which takes place either at the level of the ascending neurons or further downstream in the SEG. Systematic investigation of both mechanisms revealed a persistent trade-off between the intensity invariance and the SNR of the song representations along the pathway. 
In the following, we discuss the capabilities and limitations of our model approach as well as the implications of our findings for the design of the grasshopper auditory system, the evolution of species-specific grasshopper songs, and the ethological relevance of intensity invariance in a natural acoustic environment. \subsection{Leveraging functional modelling to investigate sensory systems} Our understanding of sensory processing systems is based on the distributed accumulation of anatomical, physiological, and ethological evidence. Functional modelling provides a powerful tool to integrate the available fragments into a coherent whole. It facilitates systematic, reproducible investigations of relevant parameters such as scale $\sca$ or threshold value $\thr$. Moreover, it allows us to address questions of broader scope by generalizing from concrete evidence. For instance, the interaction between the two mechanisms of intensity invariance is most readily assessed if both mechanisms can be treated as consecutive stages along the pathway --- where the output of the first stage relates directly to the input of the second stage --- rather than separate entities. The model pathway also provides a general basis for comparing song representations across different species without the need for species-specific models. However, the potential of functional modelling for research on sensory systems depends entirely on the amount of available knowledge about the system. The grasshopper song recognition pathway is a comparably simple and very well-understood system and is therefore a particularly suitable candidate for functional modelling. Other sensory systems that are either more complex or have not been subject to decades of study will likely not be suitable for this approach yet.
\subsection{Feature representation, temporal averaging, and song design} \label{sec:constant_feat} The feature set is the final song representation along the model pathway and constitutes the basis for song recognition. Each feature $f_i(t)$ results from the thresholding of the respective kernel response $c_i(t)$ by $\nl$ and the subsequent temporal averaging of binary response $b_i(t)$ by a lowpass filter with extremely low cutoff frequency $\fc$. At a given time point $t$, $f_i(t)$ approximately quantifies the proportion of time during which $c_i(t)$ exceeds the threshold value $\thr$ within the averaging interval $\tlp$ specified by $\fc$. The value of $f_i(t)$ is hence determined by $\thr$ with respect to the distribution $\pci$ of $c_i(t)$ and is restricted to the interval $[0,1]$. Different species-specific songs are represented by different combinations of feature values, which should preferably be constant for the duration of a song to enable reliable recognition. The fundamental requirement for a constant $f_i(t)$ is that the time where $c_i(t)>\thr$ during $\tlp$ is the same for all $t$, which is fulfilled if $\pci$ is stable across $t$. The most straightforward way to achieve a stable $\pci$ is that $c_i(t)$ is periodic and $\tlp$ is sufficiently long to average over multiple cycles of $c_i(t)$. Song-evoked $c_i(t)$ are indeed approximately periodic, which is largely an inherited property of the song itself. Most grasshopper songs are produced by stridulation, which refers to the pulling of the serrated stridulatory file on the hindlegs across a resonating vein on the forewings~(\bcite{helversen1977stridulatory}; \bcite{stumpner1994song}; \bcite{helversen1997recognition}). Every ``tooth'' that strikes the vein generates a brief sound pulse; multiple pulses make up a syllable; and the repetition of syllables and pauses results in a pattern with a high degree of temporal regularity.
Accordingly, a robust feature representation in the sense of constant $f_i(t)$ is tightly linked to the mechanism of sound production and the temporal structure of the generated song. Various grasshopper species, especially those with longer songs like \textit{C. mollis}, \textit{G. rufus}, or \textit{O. rufipes}, tend to stridulate softly at first and then continuously increase the amplitude of their song over time. This slow ``ramping'' amplitude modulation makes the overall song less periodic despite its temporal regularity. The ``ramping'' appears more pronounced in $\env(t)$ compared to $\adapt(t)$, which suggests that the logarithmic compression and adaptation during the preprocessing stage might be at least partially beneficial for mitigating the effect of this amplitude modulation on later representations. However, the adaptation of $\adapt(t)$ can only act on certain time scales --- depending on the cutoff frequency of the underlying highpass filter --- and is hence not able to compensate for ``ramping'' across the entire duration of a song. Certain grasshopper species like \textit{Chorthippus dorsatus} are known to switch their stridulation pattern in the middle of a song~(\bcite{stumpner1994song}). \textit{C. dorsatus} starts stridulating with both hindlegs in synchrony and thereby generates a pronounced syllable-pause pattern similar to that of \textit{P. parallelus}. For the last part of its song, however, \textit{C. dorsatus} switches to an alternating leg movement, which results in a more continuous but not entirely unstructured rattling sound. It is unclear what this composite design means for the feature representation of \textit{C. dorsatus} songs. In principle, both parts of the song could result in similar $\pci$ despite their different temporal structure, which would allow for consistent $f_i(t)$ across the entire song.
However, it appears more likely that only one part of the song encodes species identity, while the other part serves a different purpose such as fitness advertisement~(SOURCE?). Finally, the question remains how the choice of an appropriate averaging interval $\tlp$ depends on the duration and temporal structure of a song. The minimum $\tlp$ should encompass at least a few cycles of $c_i(t)$ to ensure a stable $\pci$ and hence a constant $f_i(t)$. The maximum $\tlp$ should not exceed the duration of a song to avoid the inclusion of behaviorally irrelevant information. The longer $\tlp$, the longer $f_i(t)$ takes to stabilize after the onset and before the offset of a song, which narrows the time window for reliable recognition. The duration of species-specific grasshopper songs can range from a few hundred milliseconds (e.g.\ \textit{Stethophyma grossum}) to well over a minute (e.g.\ \textit{C. mollis}), so that the optimal $\tlp$ is likely to differ between species. \subsection{Sensory invariances in the grasshopper auditory system} The notion of invariance is fundamental for sensory processing systems. Invariance, in the general sense, can be described as the property of a transformation to maintain variation across certain meaningful input parameters in its output while discarding variation across other input parameters. This boils down to a selective input-output decorrelation that allows the system to represent only those aspects of the stimulus that are behaviorally relevant to the organism. The grasshopper auditory system has to deal with a number of sources of non-informative song variation. For instance, the temporal structure of the song pattern warps with temperature~(\bcite{skovmand1983song}). This also affects certain structural parameters that are essential for song recognition, mainly the duration of syllables and pauses.
The auditory system can compensate for this variation by reading out relative temporal relationships rather than absolute time intervals~(\bcite{creutzig2009timescale}; \bcite{creutzig2010timescale}). The ratio of syllable duration to pause duration is relatively constant across temperatures and has been shown to be suitable for song recognition~(\bcite{helversen1972gesang}), so that there is likely no need to retain any information about the absolute duration of syllables and pauses. The situation is more complex for variations in song intensity. Song intensity at the receiver's position depends mostly on the distance to the sender and is hence not a reliable cue to infer species identity. The auditory system should therefore be invariant to intensity variations to recognize conspecific songs regardless of sender distance. However, song intensity --- specifically, the interaural intensity difference --- is also required for directional hearing, which is essential for phonotaxis~(\bcite{helversen1988interaural}). Conflicts between song recognition and directional hearing are avoided in the auditory system by distributing both functions across two parallel pathways~(\bcite{helversen1984parallel}; \bcite{ronacher1986routes}). This is the main reason why our model pathway is focused entirely on song recognition and has no capacity for directional hearing, no matter how relevant it may be to the grasshopper. Furthermore, ``invariance to variations in song intensity'' does not do justice to the full extent of the problem. Intensity is a function of song amplitude within a certain time frame. It can refer to the individual syllables and pauses of the song pattern as well as the entire song --- the former is relevant for song recognition, while the latter is not.
Intensity invariance in the current context can therefore be described as time scale-selective sensitivity to the faster amplitude dynamics of the song pattern and simultaneous insensitivity to slower, more sustained amplitude dynamics. In the model pathway, this time scale selectivity is reflected by the cutoff frequency $\fc$ of the highpass filter that underlies the adaptation of $\adapt(t)$: Most $\fc$ are effective in removing the local offset of $\db(t)$ and render $\adapt(t)$ intensity-invariant, but only sufficiently low $\fc$ will leave the relevant amplitude dynamics of the song pattern intact. \subsection{Intensity invariance versus SNR} Each processing step along the model pathway is a transformation between input representation and output representation. The intensity of the input is characterized by scale $\sca$. The intensity of the output is characterized by an appropriate intensity measure. If the transformation renders the output more intensity-invariant, then the intensity measure will saturate for sufficiently large $\sca$, which caps the output SNR to a constant value across these $\sca$. Otherwise, the intensity measure and hence the output SNR will increase monotonically with $\sca$. The trade-off between intensity invariance and SNR refers to the principle that a transformation can either improve intensity invariance or maintain SNR --- it cannot do both at the same time. This principle is presumably not specific to the two mechanisms along the model pathway but rather a general property of transformations that equalize between different input intensities. Logarithmic compression and adaptation by highpass filtering is capable of equalizing a wide range of $\sca$. In the absence of noise component $\noc(t)$, output $\adapt(t)$ is a perfectly intensity-invariant representation of song component $\soc(t)$ across all $\sca>0$. However, the presence of $\noc(t)$ limits the effectiveness of this mechanism to sufficiently large $\sca$. 
This means that intensity invariance and SNR interact at the input level, as well. Specifically, the saturation point of $\adapt(t)$ is determined by the input SNR of $\env(t)$, which in turn depends on the initial SNR of the sound signal $\raw(t)$. This initial SNR is presumably improved by the bandpass filtering of $\raw(t)$ into $\filt(t)$ at the tympanal membrane, which attenuates frequencies outside the relevant range of grasshopper songs. The SNR is then further improved by the rectification and lowpass filtering of $\filt(t)$ into $\env(t)$. This improvement depends on the cutoff frequency $\fc$ of the lowpass filter --- the lower $\fc$, the higher the SNR of $\env(t)$ at a given $\sca$. However, $\fc$ must not be too low to avoid the attenuation of relevant amplitude dynamics of the song pattern. The saturation level of $\adapt(t)$, unlike its saturation point, is independent of the SNR of $\env(t)$ because the influence of $\noc(t)$ is negligible for sufficiently large $\sca$. The output SNR of $\adapt(t)$ saturates at a comparably low value of around 10. This might in part be a consequence of the logarithm, which compresses different higher intensities but also amplifies lower intensities, including the noise floor. Both the saturation level and the saturation point of $\adapt(t)$ vary between different species and individual songs. These differences are likely rooted in the way in which logarithmic compression acts on the specific distribution of $\env(t)$, which is determined by $\fc$ as well as the temporal structure and frequency spectrum of the rectified $\filt(t)$. Thresholding and temporal averaging renders feature $f_i(t)$ intensity-invariant for sufficiently large $\sca$. The trade-off between intensity invariance and SNR is mediated by threshold value $\thr$. A lower $\thr$ ($\thr\to0$) improves intensity invariance by shifting the saturation point towards lower $\sca$ but also decreases the SNR of $f_i(t)$.
The saturation level of $f_i(t)$ is independent of $\thr$ as long as the intensity invariance by the previous mechanism is neglected. The SNR of $f_i(t)$ is therefore determined solely by the pure-noise response of $f_i(t)$. The distribution $\pci$ of the pure-noise kernel response $c_i(t)$ is largely a normal distribution with mean $\mu\approx0$ for all kernels $k_i(t)$. The value of the pure-noise $f_i(t)$ is hence 0.5 for $\thr=0$ and decreases for higher $\thr$. If $\thr$ is set above the maximum of $c_i(t)$, the pure-noise feature value is 0, which results in an ``unlimited'' SNR of $f_i(t)$. In this case, any non-zero feature value that is sustained for a sufficient duration could serve as an indicator for the presence of $\soc(t)$, although at the cost of a higher saturation point. The maximum of the pure-noise $c_i(t)$ is assumed to be very small due to the various SNR improvements along the pathway, so that the required increase in $\thr$ and hence the saturation point of $f_i(t)$ is not expected to be substantial. However, exploiting the capacity of $f_i(t)$ for arbitrarily high SNR would certainly require a fine evolutionary tuning of $\thr$ to the properties of both the species-specific song and the natural noise in a certain habitat. \newpage \subsection{Intensity invariance versus intensity invariance} Two consecutive mechanisms of intensity invariance do not necessarily add up to a stronger overall intensity invariance. If the first mechanism results in a lower saturation point than the second mechanism by itself, the saturation point of feature $f_i(t)$ will be determined solely by the first mechanism. In this case, the saturation level of $f_i(t)$ will conform to the intensity that $f_i(t)$ can reach for the given saturation point rather than the intrinsic saturation level of $f_i(t)$. 
Conversely, if the second mechanism results in a lower saturation point than the first mechanism, both the saturation point and the saturation level of $f_i(t)$ will be determined by the second mechanism. The saturation points of $f_i(t)$ across the set are distributed over a much wider range than those of the preceding kernel responses $c_i(t)$, which suggests that the interaction between the two mechanisms is specific to individual kernels $k_i(t)$. A number of $f_i(t)$ achieve a lower saturation point than the respective $c_i(t)$, while some $f_i(t)$ exhibit similar or only marginally lower saturation points. This raises the question of whether two consecutive mechanisms of intensity invariance are actually beneficial for the overall system. From a purely functional perspective, the answer could be that logarithmic compression and adaptation is a necessary preprocessing step towards a robust feature representation, even if thresholding and temporal averaging alone would be sufficient to render $f_i(t)$ intensity-invariant. This preprocessing likely improves the temporal regularity of the song pattern in $\adapt(t)$ and $c_i(t)$, which is required for constant $f_i(t)$ across the duration of a song~(Section\,\ref{sec:constant_feat}). It also ensures consistency between the distribution $\pci$ of $c_i(t)$ across songs of different intensity, which is essential for the generation of consistent species-specific $f_i(t)$ under a static $\thr$. From a physiological perspective, the answer is likely that neurons possess only a limited range of firing rates for encoding stimulus intensities that can range over several orders of magnitude. Sigmoidal tuning curves over logarithmically compressed stimulus intensities are a common property of sensory neurons across various modalities~(SOURCE?), and neurons of the grasshopper auditory system are no exception~(\bcite{suga1960peripheral}; \bcite{gollisch2002energy}). 
\subsection{Implications for behavior in a natural acoustic environment} Most grasshoppers live in environments that are communally inhabited by numerous individuals from multiple species. Their acoustic environment is characterized by noise from various sources --- abiotic ones like wind and water, but also the songs of both hetero- and conspecifics. This limits the SNR that each individual can achieve for its own song, and hence the effectiveness of the intensity-invariant processing in the auditory system. Producing higher song intensities is not a viable solution to this problem, because these also contribute to the overall noise floor. A possible behavioral solution could be to produce songs in a ``turn-taking'' manner to avoid the temporal superposition of multiple songs into overly intense signals. This would also prevent the mutual distortion of the respective song patterns. Another solution could be to spatially separate from other nearby grasshoppers to spread the potential noise sources over a larger area. However, according to our analysis based on field recordings as well as previous work on the topic~(\bcite{lang2000acoustic}), reliable song recognition is limited to little more than 1\,m from the sender, so that a grasshopper also cannot afford to stay too far away from its conspecifics. A better solution may hence be to collectively produce songs at lower-than-possible intensities, which would reduce the overall noise floor for all nearby individuals. Importantly, the limitation of intensity invariance by SNR likely applies to all grasshoppers regardless of species, so that the behavioral strategies could be shared among the species that coexist in a given habitat. % Because the presumed restriction of song recognition % by means of the noise floor applies to all grasshoppers in a certain area, % these strategies may not be specific to some of the species at this location. 
% Instead, they must be shared by all grasshopper species that coexist within a % portion of a given habitat, which would provide an important implication for % the evolution of grasshopper songs in communities of multiple species. %%% RELICS OF INTRODUCTION %%% % - Nonlinear operations can be used to detach representations from graded physical % stimulus (to facilitate categorical behavioral decision-making?):\\ % 1) Capture sufficiently precise amplitude information: $\env(t)$, $\adapt(t)$\\ % $\rightarrow$ Closely following the AM of the acoustic stimulus\\ % 2) Quantify relevant stimulus properties on a graded scale: $c_i(t)$\\ % $\rightarrow$ More decorrelated representation, compared to prior stages\\ % 3) Nonlinearity: Distinguish between "relevant vs irrelevant" values: $b_i(t)$\\ % $\rightarrow$ Trading a graded scale for two or more categorical states\\ % 4) Represent stimulus properties under relevance constraint: $f_i(t)$\\ % $\rightarrow$ Graded again but highly decorrelated from the acoustic stimulus\\ % 5) Categorical behavioral decision-making requires further nonlinearities\\ % $\rightarrow$ Parameters of a behavioral response may be graded (e.g. approach speed), % initiation of one behavior over another is categorical (e.g. 
approach/stay) % Multi-species, multi-individual communally inhabited environments\\ % - Temporal overlap: Simultaneous singing across individuals/species common\\ % - Frequency overlap: Little speciation into frequency bands (likely unused)\\ % - "Biotic noise": Hetero-/conspecifics ("Another one's songs are my noise")\\ % - "Abiotic noise": Wind, water, vegetation, anthropogenic\\ % - Effects of habitat structure on sound propagation (landscape - soundscape)\\ % $\rightarrow$ Sensory constraints imposed by the (acoustic) environment % Cluster of auditory challenges (interlocking constraints $\rightarrow$ tight coupling):\\ % From continuous acoustic input, generate neuronal representations that...\\ % 1)...allow for the separation of relevant (song) events from ambient noise floor\\ % 2)...compensate for behaviorally non-informative song variability (invariances)\\ % 3)...carry sufficient information to characterize different song patterns, % recognize the ones produced by conspecifics, and make appropriate behavioral % decisions based on context (sender identity, song type, mate/rival quality) % How can a human observer conceive a grasshopper's auditory percepts?\\ % - How to investigate the workings of the auditory pathway as a whole?\\ % - How to systematically test effects and interactions of processing parameters?\\ % - How to integrate the available knowledge on anatomy, physiology, ethology?\\ % $\rightarrow$ Abstract, simplify, formalize $\rightarrow$ Functional model framework \newpage \section{Appendix} % Not sure if we really need this one. Might raise more questions than it % provides answers. The noise component is not stable throughout nonlinear % transformations, that is all the reader needs to know, i believe. 
\begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_noise_env_sd_conversion_appendix.pdf} \caption{\textbf{Conversion of the noise component by envelope extraction.} Standard deviation $\sigma_{\eta}$ of noise component $\noc(t)$ within the signal envelope $\env(t)$ over scale $\sca$. Based on input $\raw(t)$ with $\sigma_{\eta}=1$ (corresponding to the analysis underlying Fig.\,\ref{fig:rect-lp}), using 100 random realizations of $\noc(t)$.} \label{fig:app_env-sd} \end{figure}% Referenced. \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_rect-lp_appendix.pdf} \caption{\textbf{Species-specific data underlying Fig.\,\ref{fig:rect-lp}e.} Ratio of the standard deviation $\sigma_{\text{env}}$ to the pure-noise reference $\sigma_{\eta}$ of the signal envelope $\env(t)$ over scale $\sca$ for different cutoff frequencies $\fc$ of the lowpass filter extracting $\env(t)$. Solid lines and shaded areas indicate mean $\pm$ standard deviation across songs per recording. Dashed lines indicate mean across recordings (shown in Fig.\,\ref{fig:rect-lp}e).} \label{fig:app_rect-lp} \end{figure}% Referenced. \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_log-hp_appendix.pdf} \caption{\textbf{Species-specific data underlying Fig.\,\ref{fig:log-hp}e.} Ratio of the standard deviation $\sigma_{\text{adapt}}$ to the pure-noise reference $\sigma_{\eta}$ of the intensity-adapted envelope $\adapt(t)$ over scale $\sca$. Solid lines and shaded areas indicate mean $\pm$ standard deviation across songs per recording. Dashed lines indicate mean across recordings (shown in Fig.\,\ref{fig:log-hp}e).} \label{fig:app_log-hp_curves} \end{figure}% Referenced. 
\FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_saturation_log-hp_appendix.pdf} \caption{\textbf{Species-specific saturation points underlying Fig.\,\ref{fig:log-hp}e.} Distribution of saturation points ($95\,\%$ curve span) of ratio $\sigma_{\text{adapt}} / \sigma_{\eta}$ of the intensity-adapted envelope $\adapt(t)$ over scale $\sca$ across all available songs. Dots indicate the saturation point of the mean curve across songs and recordings (shown in Fig.\,\ref{fig:log-hp}e, see also appendix Fig.\,\ref{fig:app_log-hp_curves}).} \label{fig:app_log-hp_saturation} \end{figure}% Referenced. \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_thresh-lp_pure_appendix.pdf} \caption{\textbf{Species-specific data underlying Fig.\,\ref{fig:thresh-lp_species}bd.} Average value $\muf$ of each of the three features $f_i(t)$ over scale $\sca$ in the noiseless case. Solid lines and shaded areas indicate mean $\pm$ standard deviation across songs per recording. Dashed lines indicate mean across recordings (shown in Fig.\,\ref{fig:thresh-lp_species}bd).} \label{fig:app_thresh-lp_pure} \end{figure}% Referenced. \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_thresh-lp_noise_appendix.pdf} \caption{\textbf{Species-specific data underlying Fig.\,\ref{fig:thresh-lp_species}ce.} Average value $\muf$ of each of the three features $f_i(t)$ over scale $\sca$ in the noisy case. Solid lines and shaded areas indicate mean $\pm$ standard deviation across songs per recording. Dashed lines indicate mean across recordings (shown in Fig.\,\ref{fig:thresh-lp_species}ce).} \label{fig:app_thresh-lp_noise} \end{figure}% Referenced. 
\FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_thresh_lp_appendix.pdf} \caption{\textbf{Relation between threshold value and pure-noise feature value for Fig.\,\ref{fig:thresh-lp_single} and Fig.\,\ref{fig:thresh-lp_species}.} Proportion of pure-noise kernel response $c_i(t)$ that exceeds threshold value $\thr$ --- which determines the average value $\muf$ of feature $f_i(t)$ --- over $\thr$ in multiples of standard deviation $\sigma_{c_i}$. Corresponds to a ``reverse'' cumulative distribution function (rCDF) of $c_i(t)$. Black solid lines indicate rCDF per kernel $k_i(t)$. Red dashed line indicates rCDF for a normal distribution with $\mu=0$ and $\sigma=1$. } \label{fig:app_thresh-lp_kern-sd} \end{figure}% Referenced. \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_full_appendix.pdf} \caption{\textbf{Relation between threshold value and pure-noise feature value for Fig.\,\ref{fig:pipeline_full}.} Proportion of pure-noise kernel response $c_i(t)$ that exceeds threshold value $\thr$ --- which determines the average value $\muf$ of feature $f_i(t)$ --- over $\thr$ in multiples of standard deviation $\sigma_{c_i}$. Corresponds to a ``reverse'' cumulative distribution function (rCDF) of $c_i(t)$. Black solid lines indicate rCDF per kernel $k_i(t)$. Red dashed line indicates rCDF for a normal distribution with $\mu=0$ and $\sigma=1$. } \label{fig:app_full_kern-sd} \end{figure}% Referenced. \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_short_appendix.pdf} \caption{\textbf{Relation between threshold value and pure-noise feature value for Fig.\,\ref{fig:pipeline_short}.} Proportion of pure-noise kernel response $c_i(t)$ that exceeds threshold value $\thr$ --- which determines the average value $\muf$ of feature $f_i(t)$ --- over $\thr$ in multiples of standard deviation $\sigma_{c_i}$. 
Corresponds to a ``reverse'' cumulative distribution function (rCDF) of $c_i(t)$. Black solid lines indicate rCDF per kernel $k_i(t)$. Red dashed line indicates rCDF for a normal distribution with $\mu=0$ and $\sigma=1$. } \label{fig:app_short_kern-sd} \end{figure}% Referenced. \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_field_appendix.pdf} \caption{\textbf{Relation between threshold value and pure-noise feature value for Fig.\,\ref{fig:pipeline_field}.} Proportion of pure-noise kernel response $c_i(t)$ that exceeds threshold value $\thr$ --- which determines the average value $\muf$ of feature $f_i(t)$ --- over $\thr$ in multiples of standard deviation $\sigma_{c_i}$. Corresponds to a ``reverse'' cumulative distribution function (rCDF) of $c_i(t)$. Black solid lines indicate rCDF per kernel $k_i(t)$. Red dashed line indicates rCDF for a normal distribution with $\mu=0$ and $\sigma=1$. } \label{fig:app_field_kern-sd} \end{figure}% Referenced. \FloatBarrier \begin{figure}[!ht] \centering \includegraphics[width=\textwidth]{figures/fig_invariance_cross_species_thresh_appendix.pdf} \caption{\textbf{Threshold-dependent intensity invariance of species-specific feature sets.} Same processing as in Fig.\,\ref{fig:pipeline_full}, using different kernel-specific threshold values $\thr$ (multiples of pure-noise standard deviation $\sigma_{\eta_i}$ of $c_i(t)$ for $\sca=0$; see also appendix Fig.\,\ref{fig:app_full_kern-sd}). Average value $\muf$ of each feature $f_i(t)$ over $\sca$. } \label{fig:app_cross_species_thresh} \end{figure}% Reference this one! \FloatBarrier \end{document}