% Files
% paper_2025/main.tex
%
% 2052 lines
% 120 KiB
% TeX
%
\documentclass[a4paper, 12pt]{article}
\usepackage[left=2cm,right=2cm,top=2cm,bottom=2cm,includeheadfoot]{geometry}
% \usepackage[onehalfspacing]{setspace}
\usepackage{graphicx}
\usepackage{svg}
\usepackage{import}
\usepackage{float}
\usepackage{placeins}
\usepackage{parskip}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{subcaption}
\usepackage[labelfont=bf, textfont=small]{caption}
\usepackage[german,english]{babel}
\addto\captionsenglish{\renewcommand{\figurename}{Fig.}}
\addto\captionsenglish{\renewcommand{\tablename}{Tab.}}
\usepackage[separate-uncertainty=true, locale=DE]{siunitx}
\sisetup{output-exponent-marker=\ensuremath{\mathrm{e}}}
% \usepackage[capitalize]{cleveref}
% \crefname{figure}{Fig.}{Figs.}
% \crefname{equation}{Eq.}{Eqs.}
% \creflabelformat{equation}{#2#1#3}
\usepackage[
backend=biber,
style=authoryear,
pluralothers=true,
maxcitenames=1,
mincitenames=1
]{biblatex}
\addbibresource{cite.bib}
%\bibdata
%\bibstyle
%\citation
\title{Emergent intensity invariance vs. signal-to-noise ratio at three consecutive processing stages along the grasshopper song recognition pathway}
\author{Jona Hartling, Jan Benda}
\date{}
\begin{document}
\maketitle{}
% Text references and citations:
\newcommand{\bcite}[1]{\mbox{\cite{#1}}}
% \newcommand{\fref}[1]{\mbox{\cref{#1}}}
% \newcommand{\fref}[1]{\mbox{Fig.\,\ref{#1}}}
% \newcommand{\eref}[1]{\mbox{\cref{#1}}}
% \newcommand{\eref}[1]{\mbox{Eq.\,\ref{#1}}}
% Subplot lettering:
\newcommand{\figa}{\textbf{a}}
\newcommand{\figb}{\textbf{b}}
\newcommand{\figc}{\textbf{c}}
\newcommand{\figd}{\textbf{d}}
\newcommand{\fige}{\textbf{e}}
% Math shorthands - Standard symbols:
\newcommand{\dec}{\log_{10}} % Logarithm base 10
\newcommand{\infint}{\int_{-\infty}^{+\infty}} % Improper integral over the entire real line
% Math shorthands - Spectral filtering:
\newcommand{\bp}{h_{\text{BP}}(t)} % Bandpass filter function
\newcommand{\lp}{h_{\text{LP}}(t)} % Lowpass filter function
\newcommand{\hp}{h_{\text{HP}}(t)} % Highpass filter function
\newcommand{\fc}{f_{\text{cut}}} % Filter cutoff frequency
\newcommand{\tlp}{T_{\text{LP}}} % Lowpass filter averaging interval
\newcommand{\thp}{T_{\text{HP}}} % Highpass filter adaptation interval
% Math shorthands - Early representations:
\newcommand{\raw}{x_{\text{raw}}} % Placeholder input signal
\newcommand{\filt}{x_{\text{filt}}} % Bandpass filtered signal
\newcommand{\env}{x_{\text{env}}} % Signal envelope
\newcommand{\db}{x_{\text{log}}} % Logarithmically scaled signal
\newcommand{\dbref}{x_{\text{ref}}} % Decibel reference intensity
\newcommand{\adapt}{x_{\text{adapt}}} % Adapted signal
% Math shorthands - Kernel parameters:
\newcommand{\kw}{\sigma} % Unspecific Gabor kernel width
\newcommand{\kf}{\omega} % Unspecific Gabor kernel frequency
\newcommand{\kp}{\phi} % Unspecific Gabor kernel phase
\newcommand{\kn}{n} % Unspecific Gabor kernel lobe number
\newcommand{\kwi}{\kw_i} % Specific Gabor kernel width
\newcommand{\kfi}{\kf_i} % Specific Gabor kernel frequency
\newcommand{\kpi}{\kp_i} % Specific Gabor kernel phase
\newcommand{\kni}{\kn_i} % Specific Gabor kernel lobe number
% Math shorthands - Auxiliary kernel parameters:
\newcommand{\fdrm}{\text{FDRM}} % Gaussian full duration relative to maximum
\newcommand{\rh}{h_{\text{rel}}} % Relative Gaussian height for FDRM calculation
% Math shorthands - Thresholding nonlinearity:
\newcommand{\thr}{\Theta_i} % Step function threshold value
\newcommand{\nl}{H(c_i\,-\,\thr)} % Shifted Heaviside step function
% Math shorthands - Intensity invariance analysis:
\newcommand{\soc}{s} % Song component of synthetic mixture
\newcommand{\noc}{\eta} % Noise component of synthetic mixture
\newcommand{\sca}{\alpha} % Multiplicative scale of song component
\newcommand{\xvar}{\sigma_{x}^{2}} % Variance of synthetic mixture
\newcommand{\svar}{\sigma_{\text{s}}^{2}} % Song component variance
\newcommand{\nvar}{\sigma_{\eta}^{2}} % Noise component variance
\newcommand{\xsig}{\sigma_x} % Standard deviation of synthetic mixture
\newcommand{\ssig}{\sigma_{\text{s}}} % Song component standard deviation
\newcommand{\nsig}{\sigma_{\eta}} % Noise component standard deviation
\newcommand{\pc}{p(c,\,T)} % Probability density (general interval)
\newcommand{\pclp}{p(c,\,\tlp)} % Probability density (lowpass interval)
\newcommand{\pci}{p(c_i,\,\tlp)} % Kernel-specific probability density (lowpass interval)
\newcommand{\muf}{\mu_{f_i}} % Average feature value
\section{Introduction}
% % Drosophila/visual/article:
% \bcite{ketkar2023multifaceted}
% % Drosophila/auditory/article:
% \bcite{ozeri2018fast}
% % Primate/auditory/review:
% \bcite{barbour2011intensity}
% % Cricket/auditory/article:
% \bcite{benda2008spike}
% % Locust/auditory/article:
% \bcite{clemens2010intensity}
% % Rodent/olfactory/article:
% \bcite{bolding2018recurrent}
% Introduction to intensity invariance:
Intensity invariance is a fundamental property of sensory systems across
modalities and species, from fruit flies~(\bcite{ozeri2018fast};
\bcite{ketkar2023multifaceted}) via crickets~(\bcite{benda2008spike}) and
grasshoppers~(\bcite{clemens2010intensity}) to
rodents~(\bcite{bolding2018recurrent}) and
primates~(\bcite{barbour2011intensity}). It allows for the robust recognition
of behaviorally relevant stimuli despite variations in stimulus intensity.
However, the computational mechanisms underlying intensity invariance are often
difficult to disentangle. Here, we use a physiologically inspired functional
model of the grasshopper song recognition pathway to investigate the emergence
of intensity invariance throughout the auditory processing stream.
% Why the grasshopper auditory system?
% Why focus on song recognition among other auditory functions?
The auditory system of grasshoppers~(\textit{Acrididae}) has been studied
extensively over the years. Grasshoppers rely on their sense of hearing for
intraspecific communication --- including mate
attraction~(\bcite{helversen1972gesang}) and
evaluation~(\bcite{stange2012grasshopper}), sender
localization~(\bcite{helversen1988interaural}), courtship
display~(\bcite{elsner1968neuromuskularen}), and rival
deterrence~(\bcite{greenfield1993acoustic}) --- and have evolved a variety of
acoustic signals for different behavioral
contexts~(\bcite{otte1970comparative}). The most conspicuous acoustic signals
of grasshoppers are their species-specific calling songs, which broadcast the
presence of the singing individual to potential mates within range. These songs
are usually more characteristic of a species than morphological
traits~(\bcite{tishechkin2016acoustic}; \bcite{tarasova2021eurasius}), which
can vary greatly within species~(\bcite{rowell1972variable};
\bcite{kohler2017morphological}). The reliance on songs to mediate reproduction
represents a strong evolutionary driving force that resulted in a massive
species diversification~(\bcite{vedenina2011speciation};
\bcite{sevastianov2023evolution}), with over 6800 recognized species in the
\textit{Acrididae} family~(\bcite{cigliano2024orthoptera}).
% What are the signals that the auditory system is supposed to recognize?
Grasshopper songs are amplitude-modulated broad-band acoustic signals. They
consist of a series of noisy syllables and relatively quiet pauses, which form
a characteristic repetitive pattern~(\bcite{helversen1977stridulatory};
\bcite{stumpner1994song}). Song recognition depends on certain structural
parameters of this pattern --- such as the duration of syllables and
pauses~(\bcite{helversen1972gesang}), the slope of pulse
onsets~(\bcite{helversen1993absolute}), and the accentuation of syllable onsets
relative to the preceding pause~(\bcite{balakrishnan2001song};
\bcite{helversen2004acoustic}) --- which are sufficiently conveyed by the
amplitude modulation of the song alone~(\bcite{helversen1997recognition}).
% Why is intensity invariance important for song recognition?
Grasshopper songs, like all acoustic signals, are subject to sound attenuation,
which depends on the distance from the sound source, the frequency content of
the signal, and the vegetation of the habitat~(\bcite{michelsen1978sound}).
Sound attenuation has two major consequences for song recognition. First, the
amplitude dynamics of the song pattern degrade with increasing distance to the
sender, which limits the effective communication range of grasshoppers
to~\mbox{1--2\,m} in their typical grassland
habitats~(\bcite{lang2000acoustic}). Second, the intensity of a song at the
receiver's position varies with the position of the sender, which should
ideally not affect song recognition. The auditory system thus needs to achieve
a certain degree of intensity invariance --- a time scale-selective sensitivity
to faster amplitude dynamics and simultaneous insensitivity to more sustained
amplitude dynamics. Intensity invariance is commonly associated with neural
adaptation~(\bcite{benda2008spike}; \bcite{barbour2011intensity};
\bcite{ozeri2018fast}; more general:~\bcite{benda2021neural}). Different neuron
types in the grasshopper auditory system exhibit spike-frequency adaptation in
response to sustained stimulation~(\bcite{romer1976informationsverarbeitung};
\bcite{gollisch2004input}; \bcite{hildebrandt2009origin};
\bcite{clemens2010intensity}; \bcite{fisch2012channel}). Accordingly, intensity
invariance is not the result of a single processing step but rather a gradual
process, in which different neuronal populations contribute to varying
degrees~(\bcite{clemens2010intensity}) and by different
mechanisms~(\bcite{hildebrandt2009origin}).
% How did we expand on the previous framework (feat. Clemens et al.)?
In the current study, we leverage functional modelling to trace the emergence
of intensity invariance through individual processing steps of the grasshopper
song recognition pathway. The model pathway we propose here is based on a
previous functional model framework for song recognition in both
crickets~(\bcite{clemens2013computational}; \bcite{hennig2014time}) and
grasshoppers~(\bcite{clemens2013feature}; review on
both:~\bcite{ronacher2015computational}). The existing framework relies on
pulse trains as input signals, which were designed to capture the essential
structural properties of natural song envelopes~(\bcite{clemens2013feature}).
It includes feature extraction by a bank of linear-nonlinear feature detectors,
evidence accumulation by temporal averaging of each feature, and categorical
decision making by a weighted linear combination of feature values. We adopted
the general structure of the existing framework and extended it by a
physiologically plausible preprocessing stage --- including spectral filtering,
envelope extraction, logarithmic compression, and intensity adaptation ---
which allows the model to operate on unmodified recordings of natural
grasshopper songs. The resulting model pathway thus covers the entire auditory
processing stream from the initial reception of airborne sound waves to the
generation of a high-dimensional feature representation that allows for the
categorical recognition of conspecific songs. It incorporates anatomical,
physiological, and ethological evidence from several decades of research on the
grasshopper auditory system. In the following, we provide a side-by-side
account of the known physiological processing steps along the song recognition
pathway and their functional approximations in the model pathway. We then
elaborate on the computational mechanisms that contribute to the emergence of
intensity-invariant song representations, the interaction between these
mechanisms, the overall capacity for intensity invariance in the system, and
the ethological implications of our findings.
\newpage
\section{Methods}
% This maybe does not quite fit here, but it is the most general part of the
% methods and applies throughout the whole section, so I put it here for now.
All modeling, data analysis, and data visualization were performed in
Python~3.12.3 except for the pathway overview~(Fig.\,\ref{fig:pathway}), which
was assembled in Inkscape~1.2. The code base for the model pathway is available
as the \textit{thunderhopper} package, version 1.0, on PyPI. Any audio data was
inspected and edited with the help of the \textit{audian} package, version 2.4,
on PyPI.
\subsection{Functional model of the grasshopper song recognition pathway}
The anatomical organisation of the grasshopper song recognition pathway can be
outlined as a feed-forward network of three consecutive neuronal
populations~(Fig.\,\ref{fig:pathway}a-c): Peripheral auditory receptor neurons,
whose axons enter the ventral nerve cord (VNC) at the level of the metathoracic
ganglion; local interneurons that remain exclusively within the thoracic region
of the VNC; and ascending neurons projecting from the thoracic region towards
the supraesophageal ganglion (SEG), or central
brain~(\bcite{rehbein1974structure}; \bcite{rehbein1976auditory};
\bcite{eichendorf1980projections}). The input to the network originates at the
tympanal membrane, which acts as acoustic receiver and is coupled to the
dendritic endings of the receptor neurons~(\bcite{gray1960fine}). The outputs
from the network converge in the SEG, which presumably harbors the neuronal
substrate for conspecific song recognition and response
initiation~(\bcite{ronacher1986routes}; \bcite{bauer1987separate};
\bcite{bhavsar2017brain}).
Around 15 to 20 ascending neurons have been identified in the grasshopper
auditory system~(\bcite{stumpner1991auditory}), whose functional
characteristics are conserved even between species that are not closely
related~(\bcite{neuhofer2008evolutionarily}). The population of ascending
neurons possesses a diverse range of response properties that contrasts with
the rather homogeneous responses of receptor neurons and local
interneurons~(\bcite{clemens2011efficient}), which suggests a transition from a
uniform population-wide processing stream into several parallel branches.
Accordingly, the model pathway is divided into two distinct
stages~(Fig.\,\ref{fig:pathway}d): The preprocessing stage incorporates the
processing steps at the levels of the tympanal membrane, the receptor neurons,
and the local interneurons; and operates on one-dimensional signal
representations~(Fig.\,\ref{fig:stages_pre}). The feature extraction stage
corresponds to the processing within the ascending neurons and further
downstream towards the SEG; and operates on high-dimensional signal
representations~(Fig.\,\ref{fig:stages_feat}). The details of each
physiological processing step and its functional approximation are described in
the following sections.
% NOTE: A verbatim duplicate of the preceding paragraph was removed here.
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_auditory_pathway.pdf}
\caption{\textbf{Schematic organisation of the grasshopper song recognition
pathway and structure of the functional model pathway.}
\textbf{a}:~Simplified course of the pathway in the
grasshopper, from the tympanal membrane via receptor
neurons, local interneurons, and ascending neurons further
towards the supraesophageal ganglion.
\textbf{b}:~Schematic of synaptic connections between
the three neuronal populations within the metathoracic
ganglion.
\textbf{c}:~Network representation of neuronal connectivity.
\textbf{d}:~Flow diagram of consecutive signal
representations~(boxes) and transformations~(arrows) along
the model pathway. All representations are time-varying.
1st half: Preprocessing stage~(one-dimensional
representations). 2nd half: Feature extraction
stage~(high-dimensional representations). }
\label{fig:pathway}
\end{figure}
\subsubsection{Population-driven signal preprocessing}
Grasshoppers receive airborne sound waves by a tympanal organ at each side of
the body. The tympanal membrane acts as a mechanical resonance filter for
sound-induced vibrations~(\bcite{windmill2008time}; \bcite{malkin2014energy}).
Vibrations that fall within specific frequency bands are focused on different
membrane areas, while others are attenuated. This processing step can be
approximated by an initial bandpass filter~(Fig.\,\ref{fig:stages_pre}a)
applied to the acoustic input signal $\raw(t)$:
\begin{equation}
\filt(t)\,=\,\raw(t)\,*\,\bp, \qquad \fc\,=\,5\,\text{kHz},\,30\,\text{kHz}
\label{eq:bandpass}
\end{equation}
The receptor neurons transduce the vibrations of the tympanal membrane into
sequences of action potentials. They thereby encode the amplitude modulation,
or envelope, of the signal~(\bcite{machens2001discrimination}), which likely
involves a rectifying nonlinearity~(\bcite{machens2001representation}). The
extraction of the signal envelope~(Fig.\,\ref{fig:stages_pre}b) can be modelled
as full-wave rectification followed by lowpass filtering of the tympanal signal
$\filt(t)$:
\begin{equation}
\env(t)\,=\,|\filt(t)|\,*\,\lp, \qquad \fc\,=\,250\,\text{Hz}
\label{eq:env}
\end{equation}
Furthermore, the receptors exhibit a sigmoidal response curve over
logarithmically compressed stimulus intensities~(\bcite{suga1960peripheral};
\bcite{gollisch2002energy}). In the model pathway, logarithmic
compression~(Fig.\,\ref{fig:stages_pre}c) is achieved by conversion to decibel
scale
\begin{equation}
\db(t)\,=\,20\,\cdot\,\dec \frac{\env(t)}{\dbref}, \qquad \dbref\,=\,1
\label{eq:log}
\end{equation}
relative to the common reference intensity $\dbref$. Both the receptor
neurons~(\bcite{romer1976informationsverarbeitung}; \bcite{gollisch2004input};
\bcite{fisch2012channel}) and, on a larger scale, the subsequent local
interneurons~(\bcite{hildebrandt2009origin}; \bcite{clemens2010intensity})
adapt their firing rates in response to sustained stimulus intensities, which
allows for the robust encoding of faster amplitude modulations against a slowly
changing overall baseline intensity. Functionally, the adaptation mechanism
resembles a highpass filter~(Fig.\,\ref{fig:stages_pre}d) over the
logarithmically compressed envelope $\db(t)$:
\begin{equation}
\adapt(t)\,=\,\db(t)\,*\,\hp, \qquad \fc\,=\,10\,\text{Hz}
\label{eq:highpass}
\end{equation}
This processing step concludes the preprocessing stage of the model pathway.
The resulting intensity-adapted envelope $\adapt(t)$ is then passed on from the
local interneurons to the ascending neurons, where it serves as the basis for
the following feature extraction stage.
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_pre_stages.pdf}
\caption{\textbf{Song representations during the preprocessing stage.}
Example song of \textit{O. rufipes}.
\textbf{a}:~Bandpass filtered tympanal signal $\filt(t)$.
\textbf{b}:~Signal envelope $\env(t)$.
\textbf{c}:~Logarithmically compressed envelope $\db(t)$.
\textbf{d}:~Intensity-adapted envelope $\adapt(t)$.
}
\label{fig:stages_pre}
\end{figure}
\FloatBarrier
\subsubsection{Feature extraction by individual neurons}
The population of ascending neurons extracts and encodes a number of different
features of the preprocessed signal, and hence represents the signal in a
higher-dimensional space than the preceding receptor neurons and local
interneurons~(\bcite{clemens2011efficient}). Each ascending neuron is assumed
to scan the signal for a specific template pattern, which can be thought of as
a kernel of a particular structure and on a particular time scale. This
process, known as template matching, can be modelled as a convolution of the
intensity-adapted envelope $\adapt(t)$ with a kernel $k_i(t)$ specific to the
$i$-th ascending neuron:
\begin{equation}
c_i(t)\,=\,\adapt(t)\,*\,k_i(t)
= \infint \adapt(\tau)\,\cdot\,k_i(t\,-\,\tau)\,d\tau
\label{eq:conv}
\end{equation}
We use Gabor kernels as basis functions for creating different template
patterns. Gabor functions presumably capture the essential structural
properties of the filter functions found in various auditory
neurons~(\bcite{rokem2006spike}; \bcite{clemens2011efficient};
\bcite{clemens2012nonlinear}). An arbitrary one-dimensional, real Gabor kernel
is generated by multiplication of a Gaussian envelope with standard deviation
or kernel width $\kwi$ and a sinusoidal carrier with frequency $\kfi$ and phase
$\kpi$:
\begin{equation}
k_i(t,\,\kwi,\,\kfi,\,\kpi)\,=\,e^{-\frac{t^{2}}{2{\kwi}^{2}}}\,\cdot\,\sin(\kfi\,t\,+\,\kpi), \qquad \kfi\,=\,2\pi f_{\text{sin}_i}
\label{eq:gabor}
\end{equation}
Different combinations of $\kwi$ and $\kfi$ result in Gabor kernels with
different lobe number $\kni$, which is the number of half-periods of the
carrier that fit under the Gaussian envelope within reasonable limits of
attenuation. The time window under the Gaussian envelope that contains the
relevant lobes of the kernel can be defined as Gaussian full duration at height
$\rh$ relative to the maximum of the Gaussian:
\begin{equation}
\fdrm(\kwi,\,\rh)\,=\,2\,\cdot\,\sqrt{-2\,\cdot\,\ln \rh}\cdot\,\kwi, \qquad \rh\,\in\,(0,\,1]
\label{eq:fdrm}
\end{equation}
% Yes, FDRM is a hideous acronym. Based on the common "full width at half
% maximum" (FWHM) and adjusted because "full duration at half maximum" (FDHM)
% is apparently preferred in a temporal context. Alternatively, "w_\text{gauss}"?
With this, an appropriate carrier frequency $\kfi$ for obtaining a Gabor kernel
with width $\kwi$ and desired lobe number $\kni$ can be approximated as
\begin{equation}
\kfi(\kni,\,\kwi,\,\rh)\,=\,2\pi\,\cdot\,\frac{0.5\,\cdot\,\kni\,+\,\beta_0}{\fdrm(\kwi,\,\rh)}, \qquad \kni\,\in\,\mathbb{Z},\enspace\kni\,\geq\,2
\label{eq:gabor_freq}
\end{equation}
% \begin{equation}
% \kfi(\kni,\,\kwi,\,\rh)\,=\,\frac{0.5\,\cdot\,\kni\,+\,\beta_0}{2\,\cdot\,\sqrt{-2\,\cdot\,\ln \rh}\cdot\kwi}, \qquad \kni\,\geq\,2\enspace\forall\enspace \kni\,\in\,\mathbb{Z}
% \end{equation}
The relationship between $\kfi$ and $\kni$ is approximately linear except for
small $\kni$. The offset term $\beta_0\approx0.26$ was added to balance the
amplitudes of the $\kni$ desired lobes of the kernel --- which should be
maximized --- against the amplitudes of the next-outer lobes, which should not
exceed the threshold value determined by $\rh$. Note that simple Gaussian
kernels with $\kni=1$ can be obtained by setting the carrier frequency to
$\kfi=0$ and are hence not covered by Eq.\,\ref{eq:gabor_freq}.
Carrier phase $\kpi$ determines the position of the kernel lobes relative to
the kernel center. We restrict the Gabor kernels to be either even or odd
functions by setting $\kpi$ to one of only four specific phase
values~(Tab.\,\ref{tab:gabor_phases}). Even Gabor kernels are mirror-symmetric
with odd $\kni$, whereas odd Gabor kernels are point-symmetric with even
$\kni$. Both even and odd kernels can have either positive or negative sign,
which refers to the sign of the kernel's central lobe (even kernels) or the
left of the two central lobes (odd kernels). These four major groups of Gabor
kernels allow for the extraction of different types of signal features, such as
the presence of peaks (even, $+$), troughs (even, $-$), onsets (odd, $+$), and
offsets (odd, $-$) at various time scales.
\FloatBarrier
\begin{table}[!ht]
\centering
\captionsetup{width=.45\textwidth}
\caption{Values of phase $\kp$ that are specific for the four major groups
of Gabor kernels.}
\begin{tabular}{|ccc|}
\hline
sign & even kernels & odd kernels\\
\hline
$+$ & $+\pi\,/\,2$ & $\pi$\\
$-$ & $-\pi\,/\,2$ & $0$\\
\hline
\end{tabular}
\label{tab:gabor_phases}
\end{table}
\FloatBarrier
Following the convolutional template matching~(Fig.\,\ref{fig:stages_feat}a),
each kernel-specific response $c_i(t)$ is passed through a shifted Heaviside
step-function $\nl$ with threshold value $\thr$ to obtain a binary
response~(Fig.\,\ref{fig:stages_feat}b):
\begin{equation}
b_i(t,\,\thr)\,=\,\begin{cases}
\;1, \quad c_i(t)\,>\,\thr\\
\;0, \quad c_i(t)\,\leq\,\thr
\end{cases}
\label{eq:binary}
\end{equation}
The thresholding of $c_i(t)$ into $b_i(t)$ can be thought of as a
categorization into ``relevant'' and ``irrelevant'' response values. Similar
thresholding nonlinearities have been a crucial processing step in previous
models that deal with the extraction of behaviorally relevant song features in
insects~(\bcite{clemens2013computational}; \bcite{clemens2013feature};
\bcite{hennig2014time}; \bcite{ronacher2015computational}).
% However, there is no direct physiological evidence that would allow to
% determine the exact location or underlying mechanism of such a nonlinearity in
% either the ascending neurons or at some point further downstream in the SEG.
In the grasshopper, the responses of the ascending neurons are assumed to be
integrated somewhere in the SEG~(\bcite{ronacher1986routes};
\bcite{bauer1987separate}; \bcite{bhavsar2017brain}). In the model pathway,
temporal integration is implemented as temporal averaging of the binary
responses $b_i(t)$ by a lowpass filter with extremely low cutoff frequency:
\begin{equation}
f_i(t)\,=\,b_i(t)\,*\,\lp, \qquad \fc\,=\,1\,\text{Hz}
\label{eq:lowpass}
\end{equation}
This processing step results in a set of slowly changing kernel-specific
features $f_i(t)$, which is the final representation along the model
pathway~(Fig.\,\ref{fig:stages_feat}c). In the resulting high-dimensional
feature space, different species-specific song patterns can be distinguished by
their distinct combination of feature values, e.\,g.\ using Euclidean geometry
or a simple linear classifier.
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_feat_stages.pdf}
\caption{\textbf{Song representations during the feature extraction stage.}
Example song of \textit{O. rufipes}.
Different color shades indicate different types of Gabor
kernels with specific lobe number $\kni$ and either $+$ or
$-$ sign, sorted (dark to light) first by increasing
$\kni$ and then by sign~($1\,\leq\,\kni\,\leq\,4$; first
$+$, then $-$ for each $\kni$; two kernel widths $\kwi$ of
$4\,$ms and $32\,$ms per type; 8 types, 16 kernels in
total).
\textbf{a}:~Kernel-specific filter responses $c_i(t)$.
\textbf{b}:~Binary responses $b_i(t)$.
\textbf{c}:~Finalized features $f_i(t)$.}
\label{fig:stages_feat}
\end{figure}
\FloatBarrier
\subsection{Simulation-based analysis of the model pathway}
\subsubsection{Data sourcing}
All simulations were based on a dataset that was assembled from five different
sources, each of which is an established reference for the identification of
European grasshopper species. The dataset was limited to six species from the
species-rich \textit{Gomphocerinae} sub-family that are known to be common
throughout Central and Southern Europe. All recordings were converted to
standard~\textit{.wav}~format with a sampling rate of~44.1\,kHz and an
amplitude scale in arbitrary units. Individual songs were then cut from each
recording. The dataset includes a total of 31 recordings across species, which
amounts to a total of 153 isolated songs. However, the number of available
species-specific songs varies greatly across species, with a maximum of 48
songs for \textit{C. biguttulus} and a minimum of 6 songs for \textit{C.
mollis}~(Tab.\,\ref{tab:species_list}).
\begin{itemize}
\item ``Heuschrecken beobachten, bestimmen'' by~Heiko~Bellmann\\
1$^{\text{st}}$\,edition, 1993, Naturbuch, Augsburg
\item ``Gesänge der heimischen Heuschrecken. Akustisch-optische
Bestimmungshilfe.''\\
by~Karl-Heinz~Garberding, Deutscher Jugendbund für Naturbeobachtung\\
1$^{\text{st}}$\,edition, 2001, DJN, Göttingen
\item ``Heuschrecken -- Die Stimmen von 61 heimischen Arten''
by~Heiko~Bellmann\\
1$^{\text{st}}$\,edition, 2004, AMPLE, Germering
\item ``Fauna d'Italia XLVIII -- Orthoptera'' by~Bruno~Massa, Paolo~Fontana,
Filippo~M.~Buzzetti, Roy~M.J.C.~Kleukers, Baudewijn~Odé\\
1$^{\text{st}}$\,edition, 2012, edagricola, Milano
\item ``Singing Orthoptera of Slovenia'' by~Stanislav~Gomboc, Blaz~Segula\\
1$^{\text{st}}$\,edition, 2014, EGEA, Ljubljana
\end{itemize}
\begin{table}[!ht]
\centering
\captionsetup{width=.75\textwidth}
\caption{Overview of the six grasshopper species from the
\textit{Gomphocerinae} sub-family, the number of sources per species, the
number of available recordings across sources, and the number of isolated
songs across recordings.}
\begin{tabular}{|lccc|}
\hline
\textbf{Species} & \textbf{Sources} & \textbf{Recordings} & \textbf{Songs}\\
\hline
\textit{Chorthippus biguttulus} & 5 & 6 & 48\\
\textit{Chorthippus mollis} & 3 & 3 & 6\\
\textit{Chrysochraon dispar} & 4 & 5 & 45\\
\textit{Gomphocerippus rufus} & 4 & 8 & 16\\
\textit{Omocestus rufipes} & 4 & 5 & 14\\
\textit{Pseudochorthippus parallelus} & 4 & 4 & 24\\
\hline
\end{tabular}
\label{tab:species_list}
\end{table}
\subsubsection{Generation of synthetic input signals}
Different processing steps along the model pathway were tested for intensity
invariance by generating synthetic input signals $x(t)$ of varying intensity,
transforming them through the respective processing steps, and comparing the
resulting signal representations. Inputs were generated for two distinct cases.
In the idealized, noiseless case, $x(t)$ consists of a song component $\soc(t)$
with $\ssig=1$ and a multiplicative scale $\sca$:
\begin{equation}
x(t)\,=\,\sca\,\cdot\,\soc(t), \qquad \sca\,\geq\,0
\label{eq:noiseless}
\end{equation}
In the noiseless case, $x(t)$ is hence only a scaled version of $\soc(t)$ with
$\xsig=\sca$. In the more realistic, noisy case, $x(t)$ consists of the same
song component $\soc(t)$ scaled by $\sca$ and an additive noise component
$\noc(t)$ with $\nsig=1$:
\begin{equation}
x(t)\,=\,\sca\,\cdot\,\soc(t)\,+\,\noc(t), \qquad \sca\,\geq\,0
\label{eq:noisy}
\end{equation}
Accordingly, the signal-to-noise ratio (SNR) of input $x(t)$ in the noisy case
equals the squared $\sca$ value:
\begin{equation}
\text{SNR}_x(\sca)\,=\,\frac{(\sca\,\cdot\,\ssig)^2}{\nsig^2}\,=\,\sca^2, \qquad \ssig\,=\,\nsig\,=\,1
\label{eq:input_snr}
\end{equation}
For most analyses, it would be sufficient if input $x(t)$ corresponds to the
signal representation immediately before the first of the tested
transformations. For instance, when testing the effects of logarithmic
compression~(Eq.\,\ref{eq:log}), $x(t)$ would correspond to the signal envelope
$\env(t)$. However, in this particular case, $\env(t)$ results from a nonlinear
transformation~(Eq.\,\ref{eq:env}), which cannot be synthesized as an additive
mixture of $\soc(t)$ and $\noc(t)$. For this reason, any input $x(t)$ across
all analyses corresponds not to the representation immediately before the
tested transformations but its predecessor representation instead. Therefore,
when testing logarithmic compression, $x(t)$ corresponds to the tympanal signal
$\filt(t)$ instead of $\env(t)$.
The raw $\soc(t)$ was drawn from the dataset of isolated species-specific song
recordings, whereas the raw $\noc(t)$ consists of a segment of normally
distributed white noise. Both $\soc(t)$ and $\noc(t)$ were normalized to unit
standard deviation. These can be used without further processing for all
analyses where input $x(t)$ corresponds to $\raw(t)$. For analyses where $x(t)$
corresponds to a later representation, $\soc(t)$ and $\noc(t)$ were first
processed along the model pathway up to the required representation, again
normalized to unit standard deviation, and then used to generate $x(t)$
according to either Eq.\,\ref{eq:noiseless} in the noiseless case or
Eq.\,\ref{eq:noisy} in the noisy case.
\subsubsection{Quantifying signal intensity across representations}
\label{sec:intensity_measures}
All intensity measures were calculated over a manually labeled segment within
each song. Segments always excluded the first and last few syllables to allow
slowly changing representations such as $f_i(t)$ to stabilize. The duration of
each segment and the number of contained syllables depends on the duration of
the species-specific song. Care was taken to ensure that the segment contained
a sufficient number of syllables to obtain a reliable estimate of the intensity
measures.
The standard deviation $\sigma$ was used as a measure of intensity for all
representations resulting from the transformation of input $x(t)$ up to and
including the kernel responses $c_i(t)$, for which individual $\sigma_{c_i}$
were used as kernel-specific intensity measures. The binary responses $b_i(t)$
were deemed too similar to the features $f_i(t)$ to warrant their own intensity
measure and were hence omitted from all related analyses. For $f_i(t)$,
$\sigma$ is not an appropriate intensity measure because each $f_i(t)$ is
ideally constant with $\sigma=0$ for the duration of a song. Therefore, the
average value $\muf$ of each $f_i(t)$ was used as a kernel-specific intensity
measure instead.
It is arguably not ideal to quantify the intensity of $c_i(t)$ and $f_i(t)$
separately for each kernel. Overall, these representations are not separate
signals bundled together but rather a set that acts as a unit with a single
intensity measure. However, there is no straightforward way to quantify the
intensity of $c_i(t)$ or $f_i(t)$ as a whole that would not entail a certain
ambiguity, e.\,g.\ by averaging across kernels. In this sense, we opted for the
kernel-specific approach because it allows us to assess differences in the
dependency on $\sca$ between individual members of either $c_i(t)$ or
$f_i(t)$.
The absolute intensity measures allow for comparing the intensity of a
representation across different $\sca$ values. Additionally, ratios were
calculated between the intensity measures for $\sca>0$ and the respective
pure-noise reference measure for $\sca=0$ to better compare the intensities of
different representations. This is only possible in the noisy case, where input
$x(t)=\noc(t)$ for $\sca=0$, whereas $x(t)=0$ for $\sca=0$ in the noiseless
case. At the level of input $x(t)$, the ratio of intensity measures depends on
$\sca$ through the square root of $\sca^2+1$:
\begin{equation}
\frac{\xsig}{\nsig}\,=\,\sqrt{\frac{\xsig^2}{\nsig^2}}\,=\,\sqrt{\frac{(\sca\,\cdot\,\ssig)^2\,+\,\nsig^2}{\nsig^2}}\,=\,\sqrt{\sca^2\,+\,1}, \qquad \ssig\,=\,\nsig\,=\,1
\label{eq:input_ratio}
\end{equation}
This holds only if $\soc(t)\perp\noc(t)$, so that $\xsig^2=(\sca\cdot\ssig)^2+\nsig^2$,
which is a reasonable assumption for the raw $\soc(t)$ and $\noc(t)$. However,
the dependency of the ratio on $\sca$ is not necessarily the same for
representations that are transformed from $x(t)$ by nonlinear operations, since
these change the relationship of $\soc(t)$ and $\noc(t)$ in an unpredictable
fashion~(see appendix Fig.\,\ref{fig:app_env-sd}). Furthermore, the ratio is
not a proper SNR of the representation because it does not relate $\soc(t)$ to
$\noc(t)$ within the representation but rather the entire representation to
$\noc(t)$ alone. However, it still provides a useful measure of the relative
intensity of a representation with and without $\soc(t)$, which is the closest
we can get to the SNR of the representation. As such, the ratio of intensity
measures is referred to as SNR in the following.
% Is this legal? "SNR" is much shorter than "ratio of intensity measure to the pure-noise reference measure".
% Haven't used it much yet, sticked to "ratio" in most cases.
\subsection{Field data-based analysis of the model pathway}
Field recordings were taken on a meadow in the vicinity of the University of
Tübingen, Germany, during the day in August~2024. All recordings were taken
using a custom hand-held microphone array that was assembled from eight
omnidirectional AV-TEFE TCM141 condenser microphones. The microphones were
arranged in a linear configuration with a spacing of 30\,cm between adjacent
microphones and oriented in the same direction along the axis of the array. All
microphones were connected to a custom 8-channel amplification and
digitization system based on a Teensy 4.1 microcontroller with real-time clock
and microSD card storage. Recordings were written to the microSD card
in~\textit{.wav}~format with a sampling rate of 96\,kHz and an amplitude scale
in arbitrary units. The microphone array was held at a height of approximately
30\,cm above the ground, which was slightly above the height of most
surrounding vegetation and at the same height as the singing grasshopper. The
array was moved as close to the grasshopper as possible without interrupting
its song production, which amounts to an approximate offset distance of 10\,cm
between the animal and the leading microphone. Care was taken to maintain a
stable position and height of the microphone array during recording. The
resulting recordings were then processed through the model pathway and analyzed
according to the procedure described in Section~\ref{sec:intensity_measures}.
\subsection{Determining kernel-specific threshold values}
Different kernels $k_i(t)$ result in specific kernel responses $c_i(t)$,
Eq.\,\ref{eq:conv}, which are then transformed further into binary responses
$b_i(t)$, Eq.\,\ref{eq:binary}, by thresholding nonlinearity $\nl$. The
threshold value $\thr$ is specific to each $k_i(t)$. Across all analyses,
$\thr$ has been specified as a multiple of the pure-noise reference standard
deviation $\sigma_{c_i}$ for input $x(t)=\noc(t)$. This ensures that $\thr$ as
well as the resulting $b_i(t)$ and $f_i(t)$ are comparable across different
$k_i(t)$ because each pure-noise $c_i(t)$ approximately follows a normal
distribution around zero~(see appendix
Figs.\,\ref{fig:app_thresh-lp_kern-sd}--\ref{fig:app_field_kern-sd}).
\newpage
\section{Results}
\subsection{Mechanisms driving the emergence of intensity invariance}
It is not necessary to test each processing step along the model pathway for
intensity invariance. Instead, we can focus on those steps that involve
nonlinear transformations, since these are the only steps that can potentially
change the dependency on scale $\sca$ between the input and output
representations. Overall, there are three nonlinear transformations along the
model pathway: Full-wave rectification during envelope extraction, logarithmic
compression, and the thresholding nonlinearity during feature extraction. In
the following, we analyze the effects of each of these transformations on the
intensity and SNR of the resulting representations as well as their potential
contribution to intensity invariance.
\subsubsection{Full-wave rectification \& lowpass filtering}
The first nonlinear transformation along the model pathway is the full-wave
rectification of the tympanal signal $\filt(t)$ during the extraction of the
signal envelope (Eq.\,\ref{eq:env}). Rectification transforms the distribution
of $\filt(t)$ from an approximately zero-centered distribution with both
positive and negative values into a strictly non-negative distribution. Signal
envelope $\env(t)$ is then obtained by lowpass filtering the rectified
$\filt(t)$. The effects of this transformation pair on SNR and potential
intensity invariance were analyzed by rescaling and processing the input signal
$\raw(t)$ and comparing standard deviations between the resulting $\filt(t)$
and $\env(t)$, once for the noiseless case~(Fig.\,\ref{fig:rect-lp}a) and once
for the noisy case~(Fig.\,\ref{fig:rect-lp}b). In addition, the cutoff
frequency $\fc$ of the lowpass filter was varied to investigate the influence
of different filter bandwidths. In the noiseless case, the standard deviations
of $\filt(t)$ and $\env(t)$ are each reduced compared to the input $\raw(t)$ by
a multiplicative factor. These factors are constant across all $\sca$, which
results in a downward shift of the respective curve on a double-logarithmic
scale, away from the diagonal~(Fig.\,\ref{fig:rect-lp}c). For $\filt(t)$, the
reduction is a consequence of the bandpass filtering~(Eq.\,\ref{eq:bandpass})
of $\raw(t)$. For $\env(t)$, the standard deviation is further reduced compared
to $\filt(t)$. Rectification contributes much less to this reduction than
lowpass filtering. The degree of reduction by lowpass filtering depends on the
cutoff frequency $\fc$, with lower $\fc$ (narrow bandwidth) resulting in a
stronger reduction. In the noisy case, the standard deviations of $\filt(t)$
and $\env(t)$ can be related to the respective pure-noise reference standard
deviation~(Fig.\,\ref{fig:rect-lp}d). This causes each curve to start with a
constant regime of SNR values near 1 for smaller $\sca$, which reflects the
dominance of the noise component $\noc(t)$ over the song component $\soc(t)$ in
the input $\raw(t)$. For larger $\sca$, all curves transition into a regime of
linearly increasing SNR on a double-logarithmic scale. For $\filt(t)$, the
linear part of the curve deviates only slightly from the diagonal. For
$\env(t)$, however, the transition occurs at lower $\sca$ compared to
$\filt(t)$, and the linear part of the curve is shifted leftward away from the
diagonal, which means that higher SNR values are achieved for the same $\sca$.
This effect is more pronounced for lower $\fc$ of the lowpass filter and is
presumably caused by the attenuation of high-frequency components in the
signal, which are more prominent in the noise component $\noc(t)$ than in the
song component $\soc(t)$. The effect also appears relatively consistent across
different species, although small variations exist~(Fig.\,\ref{fig:rect-lp}e
and appendix Fig.\,\ref{fig:app_rect-lp}). In summary, the standard deviation
of $\env(t)$ has never been observed to saturate for larger $\sca$ but rather
continues to increase proportionally to $\sca$ for all tested $\fc$, in both
the noiseless and the noisy case and across different species. Consequently,
the combination of rectification and lowpass filtering does not contribute to
intensity invariance. However, this transformation pair does improve the SNR of
$\env(t)$ relative to $\filt(t)$ and thus provides subsequent processing stages
with a more robust input representation and higher input SNR.
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_rect_lp.pdf}
\caption{\textbf{Rectification and lowpass filtering improves SNR
but does not contribute to intensity invariance.}
Input $\raw(t)$ consists of $\soc(t)$ scaled by $\sca$ with
optional $\noc(t)$ and is successively transformed into
tympanal signal $\filt(t)$ and envelope $\env(t)$.
\textbf{Top}:~Examples of $\filt(t)$ and $\env(t)$ for
different $\sca$.
\textbf{a}:~Noiseless case.
\textbf{b}:~Noisy case.
\textbf{Bottom}:~Intensity measures over $\sca$. Different
line styles indicate different cutoff frequencies $\fc$ of the
lowpass filter extracting $\env(t)$.
\textbf{c}:~Noiseless case: Standard deviation $\sigma_x$ of
$\filt(t)$ and $\env(t)$, respectively.
\textbf{d}:~Noisy case: Ratio of $\sigma_x$ to the respective
pure-noise reference $\sigma_{\eta}$ for $\sca=0$.
\textbf{e}:~Ratio of $\sigma_x$ to $\sigma_{\eta}$ of
$\env(t)$ as in \textbf{d} for different species (averaged
over songs and recordings, appendix
Fig.\,\ref{fig:app_rect-lp}).
}
\label{fig:rect-lp}
\end{figure}
\FloatBarrier
\subsubsection{Logarithmic compression \& spike-frequency adaptation}
The second nonlinear transformation along the model pathway is the logarithmic
compression of the signal envelope $\env(t)$ into $\db(t)$, Eq.\,\ref{eq:log},
which is then followed by the highpass filtering of $\db(t)$,
Eq.\,\ref{eq:highpass}, to obtain the intensity-adapted envelope $\adapt(t)$.
The interplay of this transformation pair was analyzed by rescaling and
processing the input signal $\filt(t)$ and comparing standard deviations
between the resulting $\env(t)$, $\db(t)$, and $\adapt(t)$. It is necessary to
use $\filt(t)$ as input for this analysis instead of $\env(t)$, because
$\env(t)$ results from a nonlinear transformation and hence cannot be
synthesized as an additive mixture of song component $\soc(t)$ and noise
component $\noc(t)$. % <-- Sentence may be methods section material.
However, it is much easier to conceive a mathematical description of the
effects of logarithmic compression and adaptation if $\env(t)$ itself is
assumed to be composed of $\soc(t)$ and $\noc(t)$. In the noiseless
case~(Fig.\,\ref{fig:log-hp}a), $\env(t)$ takes the form of
\begin{equation}
\env(t)\,=\,\sca\,\cdot\,\soc(t), \qquad \env(t)\,>\,0\enspace\forall\enspace t\,\in\,\mathbb{R}
\label{eq:toy_env_pure}
\end{equation}
The standard deviation of $\env(t)$ increases linearly with $\sca$ on a
double-logarithmic scale and is slightly reduced~(Fig.\,\ref{fig:log-hp}c)
compared to the input $\filt(t)$, which is consistent with the results of the
previous analysis~(Fig.\,\ref{fig:rect-lp}c). By conversion of $\env(t)$ to
decibel scale, $\sca$ turns from a multiplicative scale in linear space into an
additive term, or offset, in logarithmic space:
\begin{equation}
\db(t)\,=\,20\,\cdot\,\dec \left[\,\sca\,\cdot\,s(t)\,\right]\,=\,20\,\cdot\,\left[\dec \sca\,+\,\dec s(t)\right], \qquad \sca\,>\,0
\label{eq:toy_log_pure}
\end{equation}
The highpass filtering of $\db(t)$ can be approximated as a subtraction of the
local signal offset within a suitable time interval $0 \ll \thp <
\frac{1}{\fc}$:
\begin{equation}
\begin{split}
\adapt(t)\,\approx\,\db(t)\,-\,20\,\cdot\,\dec \sca\,=\,20\,\cdot\,\dec s(t)
\end{split}
\label{eq:toy_highpass_pure}
\end{equation}
This eliminates $\sca$ from $\adapt(t)$ and thus renders it perfectly
intensity-invariant, with a constant standard deviation of around 10\,dB across
all $\sca>0$~(Fig.\,\ref{fig:log-hp}c). In contrast, in the noisy
case~(Fig.\,\ref{fig:log-hp}b), $\env(t)$ takes the form of
\begin{equation}
\env(t)\,=\,\sca\,\cdot\,\soc(t)\,+\,\noc(t), \qquad \env(t)\,>\,0\enspace\forall\enspace t\,\in\,\mathbb{R}
\label{eq:toy_env_noise}
\end{equation}
Similar to the previous analysis~(Fig.\,\ref{fig:rect-lp}d), the ratio of the
standard deviation of $\env(t)$ to its pure-noise reference standard deviation
on a double-logarithmic scale follows a constant regime for small $\sca$ and a
linearly increasing regime for larger $\sca$~(Fig.\,\ref{fig:log-hp}d). Decibel
conversion of $\env(t)$
% \begin{equation}
% \begin{split}
% \db(t)\,&=\,20\,\cdot\,\dec \left[\,\sca\,\cdot\,s(t)\,+\,\eta(t)\,\right]\\
% &=\,20\,\cdot\,\left(\dec \sca\,+\,\dec \left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]\right), \qquad \sca\,>\,0
% \end{split}
% \label{eq:toy_log_noise}
% \end{equation}
\begin{equation}
\db(t)\,=\,20\,\cdot\,\left(\dec \sca\,+\,\dec \left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]\right), \qquad \sca\,>\,0
\label{eq:toy_log_noise}
\end{equation}
allows for the separation of $\sca$ from $\soc(t)$ but introduces a scaling of
$\noc(t)$ by the inverse of $\sca$, which remains present even after the offset
subtraction:
\begin{equation}
\begin{split}
\adapt(t)\,\approx\,20\,\cdot\,\dec\left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]
\end{split}
\label{eq:toy_highpass_noise}
\end{equation}
% \begin{equation}
% \begin{split}
% \adapt(t)\,\approx\,\db(t)\,-\,20\,\cdot\,\dec \sca\,=\,20\,\cdot\,\dec\left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]
% \end{split}
% \label{eq:toy_highpass_noise}
% \end{equation}
This means that, in the noisy case, $\sca$ cannot be entirely eliminated from
$\adapt(t)$, only redistributed between $\soc(t)$ and $\noc(t)$. If $\sca$ is
sufficiently large ($\sca\gg1$, saturation regime), $\noc(t)$ is attenuated to
the point of being negligible, so that $\adapt(t)$ is a scale-free
representation of $\soc(t)$. If $\sca$ and $\noc(t)$ are at similar scales
($\sca\approx1$, transient regime), $\adapt(t)$ largely resembles $\db(t)$.
Finally, if $\sca$ is sufficiently small ($0<\sca\ll1$, noise regime),
$\noc(t)$ masks $\soc(t)$ even after the intensity adaptation. Accordingly, the
effective intensity invariance of $\adapt(t)$ through logarithmic compression
and adaptation is limited by the SNR of $\env(t)$: Songs that have already
sunk into the noise floor at the level of $\env(t)$ cannot be recovered by
subsequent processing steps. The general pattern of noise regime, transient
regime, and saturation regime remains consistent across different
species~(Fig.\,\ref{fig:log-hp}e). However, the saturation point --- the $\sca$
value at which the SNR of $\adapt(t)$ starts to saturate --- and the saturation
level --- the constant SNR of $\adapt(t)$ within the saturation regime --- vary
considerably between and within species~(appendix
Figs.\,\ref{fig:app_log-hp_curves}+\ref{fig:app_log-hp_saturation}). For
example, \textit{C. biguttulus} and \textit{C. mollis} display a noticeably
lower saturation level compared to other species. These differences are not to
be underestimated, since the saturation level of $\adapt(t)$ determines the
maximum input SNR for subsequent processing steps. In other words, the fact
that $\adapt(t)$ eventually reaches a saturation regime is, of course,
desirable in the context of intensity invariance, but it also means passing up
the higher SNR values that are achieved by $\env(t)$ for the same $\sca$ (up
to several orders of magnitude, Fig.\,\ref{fig:log-hp}d). This trade-off
between intensity invariance and SNR is a recurring phenomenon that is further
addressed in the following sections.
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_log_hp.pdf}
\caption{\textbf{Intensity invariance through logarithmic compression and
adaptation is restricted by the noise floor and decreases
SNR.}
Input $\filt(t)$ consists of $\soc(t)$
scaled by $\sca$ with optional $\noc(t)$
and is successively transformed into envelope $\env(t)$,
logarithmically compressed envelope $\db(t)$, and
intensity-adapted envelope $\adapt(t)$.
\textbf{Top}:~Examples of $\env(t)$, $\db(t)$, and
$\adapt(t)$ for different $\sca$.
\textbf{a}:~Noiseless case.
\textbf{b}:~Noisy case.
\textbf{Bottom}:~Intensity measures over $\sca$.
\textbf{c}:~Noiseless case: Standard deviation $\sigma_x$
of $\env(t)$, $\db(t)$, and $\adapt(t)$, respectively.
\textbf{d}:~Noisy case: Ratio of $\sigma_x$ to the
respective pure-noise reference $\sigma_{\eta}$ for
$\sca=0$. Shaded areas indicate $5\,\%$ (dark grey) and
$95\,\%$ (light grey) curve span for $\adapt(t)$.
\textbf{e}:~Ratio of $\sigma_x$ to $\sigma_{\eta}$ of
$\adapt(t)$ as in \textbf{d} for different species
(averaged over songs and recordings, appendix
Fig.\,\ref{fig:app_log-hp_curves}). Dots indicate $95\,\%$
curve span per species.
}
\label{fig:log-hp}
\end{figure}
\FloatBarrier
\subsubsection{Thresholding nonlinearity \& temporal averaging}
The third nonlinear transformation along the model pathway is the thresholding
nonlinearity $\nl$ that transforms each kernel response $c_i(t)$ into a binary
response $b_i(t)$, Eq.\,\ref{eq:binary}. This transformation takes place
after the convolutional filtering of $\adapt(t)$ with kernel $k_i(t)$,
Eq.\,\ref{eq:conv}, and is followed by the temporal averaging of $b_i(t)$ into
the feature set $f_i(t)$ by a lowpass filter, Eq.\,\ref{eq:lowpass}. The
effects of thresholding and temporal averaging are best illustrated based on a
single kernel~(Fig.\,\ref{fig:thresh-lp_single}) instead of the full set. For
this analysis, input $\adapt(t)$ was
rescaled~(Fig.\,\ref{fig:thresh-lp_single}a) and convolved with kernel $k(t)$.
The resulting kernel response $c(t)$ was passed through $H(c\,-\,\Theta)$ with
three different threshold values
$\Theta$~(Fig.\,\ref{fig:thresh-lp_single}b-d). Each resulting binary response
$b(t)$ was transformed into $f(t)$, whose average feature value $\mu_f$ serves
as a measure of intensity~(Fig.\,\ref{fig:thresh-lp_single}ef). The
thresholding nonlinearity $H(c\,-\,\Theta)$ categorizes the values of $c(t)$
into ``relevant'' ($c(t)>\Theta$, $b(t)=1$) and ``irrelevant'' ($c(t)\leq\Theta$,
$b(t)=0$) response values. It thereby splits the probability density $\pc$ of
$c(t)$ within some observed time interval $T$ into two complementary parts
around $\Theta$:
\begin{equation}
\int_{\Theta}^{+\infty} \pc\,dc\,=\,1\,-\,\int_{-\infty}^{\Theta} \pc\,dc\,=\,\frac{T_1}{T}, \qquad \infint \pc\,dc\,=\,1
\label{eq:pdf_split}
\end{equation}
The right-sided part of the split $\pc$ corresponds to time $T_1$ where
$c(t)>\Theta$, while the left-sided part corresponds to time $T_0=T-T_1$ where
$c(t)\leq\Theta$. The improper integral over the right-sided part of $\pc$
represents the ratio of time $T_1$ to total time $T$ because the integral of a
probability density over its entire domain is normalized to 1. The lowpass filtering of
$b(t)$ can be approximated as temporal averaging over a suitable time interval
$\tlp>\frac{1}{\fc}$ in order to express $f(t)$ as a similar temporal ratio
\begin{equation}
f(t)\,\approx\,\frac{1}{\tlp} \int_{t}^{t\,+\,\tlp} b(\tau)\,d\tau\,=\,\frac{T_1}{\tlp}, \qquad b(t)\,\in\,\{0,\,1\}
\label{eq:feat_avg}
\end{equation}
of time $T_1$ during which $b(t)$ is 1 within the averaging interval $\tlp$.
Therefore, the value of $f(t)$ at every time point $t$ approximately signifies
the cumulative probability that $c(t)$ exceeds $\Theta$ during the
corresponding averaging interval $\tlp$:
\begin{equation}
f(t)\,\approx\,\int_{\Theta}^{+\infty} \pclp\,dc\,=\,P(c\,>\,\Theta,\,\tlp)
\label{eq:feat_prop}
\end{equation}
In a sense, $f(t)$ can be interpreted as some sort of duty cycle with respect
to $\Theta$. For example, a feature value of $f(t)=0.4$ means that $c(t)$
exceeds $\Theta$ for approximately 40\,\% of the time within $\tlp$ around $t$.
In the most extreme cases, $\Theta$ lies either above the maximum of $c(t)$ or
below the minimum of $c(t)$, which results in a minimum or maximum possible
feature value of $f(t)=0$~(Fig.\,\ref{fig:thresh-lp_single}d, left column) or
$f(t)=1$, respectively.
Importantly, $f(t)$ neither retains information about the timing of individual
threshold crossings nor the precise values of $c(t)$ apart from their relation
to $\Theta$. Accordingly, for a given $\Theta$, different $\sca$ can still
result in similar $T_1$ segments (and hence similar feature values) depending
on the magnitude of the derivative of $c(t)$ in temporal proximity to time
points at which $c(t)$ crosses $\Theta$: The steeper the slope of $c(t)$, the
less $T_1$ changes with variations in $\sca$. The most reliable way of
exploiting this invariant property of $f(t)$ is to set $\Theta$ to a value near
0, because these values are least affected by different scales of $c(t)$. For
sufficiently large $\sca$, $f(t)$ then approaches the same constant $\mu_f$ in
both the noiseless and the noisy case~(Fig.\,\ref{fig:thresh-lp_single}e,
saturation regime).
The saturation level of $f(t)$ is independent of the precise value of $\Theta$,
but the saturation point decreases with decreasing
$\Theta$~(Fig.\,\ref{fig:thresh-lp_single}e). Therefore, a threshold value of
$\Theta=0$ would be the optimal choice for achieving intensity invariance at
the lowest possible $\sca$. In stark contrast, the closer $\Theta$ is to 0, the
higher $\mu_f$ in response to the pure noise component $\noc(t)$ and the lower
the resulting SNR of $f(t)$ between noise regime and saturation
regime~(Fig.\,\ref{fig:thresh-lp_single}b-d, left column, and
Fig.\,\ref{fig:thresh-lp_single}e). This trade-off between intensity invariance
and SNR has already been observed during the previous analysis on logarithmic
compression and adaptation~(Fig.\,\ref{fig:log-hp}d).
Finally, the effects of thresholding and temporal averaging must be seen in the
context of the previous transformation pair of logarithmic compression and
adaptation: In the current analysis, the input $\adapt(t)$ can be rescaled by
arbitrarily large $\sca$, while in the full pathway, the current input
$\adapt(t)$ is the output $\adapt(t)$ of the previous transformation pair and
is hence capped to a maximum standard deviation of around
10\,dB~(Fig.\,\ref{fig:log-hp}cd). This can be illustrated by plotting $\mu_f$
not over $\sca$~(Fig.\,\ref{fig:thresh-lp_single}e) but over the standard
deviation of input $\adapt(t)$ instead~(Fig.\,\ref{fig:thresh-lp_single}f). It
becomes apparent that $\mu_f$ saturates only for standard deviations of
$\adapt(t)$ that would already be capped. Accordingly, $f(t)$ never reaches the
saturation regime as determined by the current transformation pair but rather
adheres to the saturation regime determined by the previous transformation
pair. In this case, the saturated $\mu_f$ is not independent of $\Theta$
anymore. The consequences of this interaction between the two mechanisms of
intensity invariance are further explored in a later section.
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh_lp_single.pdf}
\caption{\textbf{Intensity invariance through thresholding and temporal
averaging is mediated by the interaction of threshold
value and noise floor.}
                     Input $\adapt(t)$ consists of $\soc(t)$ scaled by $\sca$ with
optional $\noc(t)$ and is transformed into single kernel
response $c(t)$, binary response $b(t)$, and feature
$f(t)$. Different color shades indicate different
threshold values $\Theta$ (multiples of pure-noise
standard deviation $\sigma_{\eta}$ of $c(t)$ for $\sca=0$,
with darker colors for higher $\Theta$. See also appendix
Fig.\,\ref{fig:app_thresh-lp_kern-sd}).
\textbf{Left}:~Noisy case: Examples of $\adapt(t)$ as well
as $c(t)$, $b(t)$, and $f(t)$ for different $\sca$.
\textbf{a}:~$\adapt(t)$ with kernel $k(t)$ in black.
\textbf{b\,-\,d}: $c(t)$, $b(t)$, and $f(t)$ based on the
same $\adapt(t)$ from \textbf{a} but for different
$\Theta$.
\textbf{Right}:~Average value $\mu_f$ of $f(t)$ for each
$\Theta$ from \textbf{b\,-\,d}. Dots indicate $95\,\%$
curve span (noisy case).
\textbf{e}:~$\mu_f$ over $\sca$, once for the noisy case
(solid lines) and once for the noiseless case (dotted
lines).
\textbf{f}:~Noisy case: $\mu_f$ over standard deviation
                     $\sigma_{\text{adapt}}$ of input $\adapt(t)$ corresponding to
$\sca$ shown in \textbf{e}. Shaded area indicates values
of $\sigma_{\text{adapt}}$ that are capped in the output
$\adapt(t)$ of the previous transformation pair
(Fig.\,\ref{fig:log-hp}cd).
}
\label{fig:thresh-lp_single}
\end{figure}
\FloatBarrier
\subsection{Intensity invariance of species-specific feature representations}
Having established both the meaning of the feature value and the mechanism of
intensity invariance by thresholding and temporal averaging, the question
remains how this mechanism acts on a set of features $f_i(t)$ based on
different species-specific songs~(Fig.\,\ref{fig:thresh-lp_species}a). The
previous analysis was repeated with three different kernels $k_i(t)$ using a
single kernel-specific threshold value $\thr$; and the resulting average
feature values $\muf$ were plotted over
$\sca$~(Fig.\,\ref{fig:thresh-lp_species}bc). Additionally, 2D feature spaces
spanned by each pair of $f_i(t)$ were plotted to investigate the separability
of species-specific songs based on the feature representation as a function of
$\sca$~(Fig.\,\ref{fig:thresh-lp_species}de). Each species-specific combination
of $\muf$ follows a trajectory through feature space that develops with $\sca$.
These trajectories correspond to the transient regime between the constant
(noise) regime and the saturation regime, which are only visible as the start
and end points of the trajectories, respectively. The horizontal dashes in the
colorbars indicate the range of $\sca$ that corresponds to the transient regime
across $f_i(t)$ for each species.
In the noiseless case, each $\muf$ is 0 for small $\sca$ across all
species~(Fig.\,\ref{fig:thresh-lp_species}b) because $c_i(t)$ never exceeds
$\thr$. Accordingly, each trajectory starts at the origin of the feature
space~(Fig.\,\ref{fig:thresh-lp_species}d). For larger $\sca$, all $\muf$
saturate at individual values whose combination differs between species, so
that the songs of each species are eventually represented by distinct points in
feature space. However, the species-specific trajectories cross each other at
numerous points, which means that the songs of two species --- each at a
specific $\sca$ --- can result in the same combination of $\muf$. Furthermore,
the specific saturation point of $f_i(t)$ depends on the species: For
\textit{C. mollis}, all $\muf$ saturate around the same $\sca$, while
\textit{O. rufipes} exhibits considerable variation between the three $f_i(t)$.
The larger the variation in saturation points between $f_i(t)$, the stronger
the curvature of the trajectory through feature space.
In the noisy case, $\muf$ is non-zero even for the smallest
$\sca$~(Fig.\,\ref{fig:thresh-lp_species}c) because the addition of the noise
component $\noc(t)$ to input $\adapt(t)$ drives $c_i(t)$ above $\thr$
regardless of the song component $\soc(t)$. The starting value of $\muf$ is the
same across all $f_i(t)$ and species by construction of the specific $\thr$. In
consequence, the trajectories through feature space do not start at the origin
but rather at approximately the same point along the
diagonal~(Fig.\,\ref{fig:thresh-lp_species}e). For larger $\sca$, all $\muf$
saturate at the same values as in the noiseless case, as expected from the
previous analysis~(Fig.\,\ref{fig:thresh-lp_single}e). However, the
trajectories now move a much shorter distance through feature space for a
similar range of $\sca$ due to the lower SNR of $f_i(t)$ between noise regime
and saturation regime, which increases the likelihood of trajectories crossing
each other. Finally, the saturation points of $f_i(t)$ for a given species are
slightly higher in the noisy case, but the variation between $f_i(t)$ remains
largely unchanged.
In summary, even a comparably small set of three features $f_i(t)$ can, in
principle, represent different species-specific songs at distinct points in
feature space, regardless of the presence of noise. However, this only holds
for sufficiently large $\sca$ that allow $f_i(t)$ to reach a saturation regime.
During the transient regime, the species-specific combination of $\muf$ can
very well be the same for two or more different species at specific $\sca$,
although this may be alleviated by the inclusion of additional $f_i(t)$.
Overall, the results of this analysis suggest that $\thr$ should rather be
chosen in favor of a higher SNR ($\thr$ just above pure-noise $c_i(t)$) than a
lower saturation point ($\thr\to0$). First, because this reduces the density of
trajectories through feature space, and second, because the capping of
$\adapt(t)$ by the previous transformation pair likely renders the saturation
point of $f_i(t)$ less relevant.
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh_lp_species.pdf}
\caption{\textbf{Feature representation of different species-specific songs
saturates at different points in feature space.}
Same input and processing as in
Fig.\,\ref{fig:thresh-lp_single} but with three different
kernels $k_i$ and a single kernel-specific threshold value
$\thr=0.5\cdot\sigma_{\eta_i}$ (appendix
Fig.\,\ref{fig:app_thresh-lp_kern-sd}).
\textbf{a}:~Examples of species-specific grasshopper
songs.
\textbf{Middle}:~Average value $\muf$ of each feature
$f_i(t)$ over $\sca$ per species (averaged over songs and
recordings, appendix Figs.\,\ref{fig:app_thresh-lp_pure}
and \ref{fig:app_thresh-lp_noise}). Different color shades
indicate different $k_i$. Dots indicate $95\,\%$ curve
span per $k_i$.
\textbf{b}:~Noiseless case.
\textbf{c}:~Noisy case.
\textbf{Bottom}:~2D feature spaces spanned by each pair of
$f_i(t)$. Each trajectory corresponds to a
species-specific combination of $\muf$ that develops
with $\sca$ (colorbars). Horizontal dashes in the colorbar
indicate $5\,\%$ (dark grey) and $95\,\%$ (light grey)
curve span of the norm across all three $\muf$ per
species.
\textbf{d}:~Noiseless case.
\textbf{e}:~Noisy case. Shaded areas indicate the average
minimum $\muf$ across all species-specific trajectories.
}
\label{fig:thresh-lp_species}
\end{figure}
\FloatBarrier
\subsection{Intensity invariance along the full model pathway}
Through the previous analyses, we could establish two mechanisms of intensity
invariance: Logarithmic compression and adaptation as well as thresholding and
temporal averaging. While each transformation pair by itself can provide some
level of invariance, certain results suggest that the first mechanism may
actually limit or even nullify the effect of the second mechanism. In the
following sections, we investigate the combined effect of both mechanisms along
the full model pathway~(Fig.\,\ref{fig:pipeline_full}) and explore the
consequences of disabling the first mechanism by skipping the logarithmic
compression step~(Fig.\,\ref{fig:pipeline_short}).
\subsubsection{Including logarithmic compression}
For this analysis, input $\raw(t)$ --- including both song component $\soc(t)$
and noise component $\noc(t)$ --- was rescaled and processed throughout all
steps of the model pathway~(Fig.\,\ref{fig:pipeline_full}a) up to the feature
set $f_i(t)$. As before, the standard deviation was used as intensity measure
for each resulting representation except $b_i(t)$ and $f_i(t)$. For $f_i(t)$,
the average feature value $\muf$ was used, while $b_i(t)$ was omitted from the
analysis. Plotting each intensity measure over
$\sca$~(Fig.\,\ref{fig:pipeline_full}b) reinforces many of the previous
observations. For ease of visualization, the kernel-specific curves for
$c_i(t)$ and $f_i(t)$ were summarized by their median. Representations prior to
logarithmic compression --- $\filt(t)$ and $\env(t)$ --- show a linear increase
of the intensity measure for larger $\sca$ on a double-logarithmic scale.
Representations after logarithmic compression --- $\db(t)$, $\adapt(t)$, and
$c_i(t)$ --- are the first to reach a saturation regime and do so at
approximately the same $\sca$ because they are separated only by linear
transformations. Feature set $f_i(t)$ reaches a saturation regime, as well. But
contrary to previous results, the saturation point of $f_i(t)$ appears below
that of $c_i(t)$, which suggests that the second mechanism of thresholding and
temporal averaging can indeed improve intensity invariance beyond the first
mechanism of logarithmic compression and adaptation. The difference in
saturation points is best illustrated based on the ratio of each intensity
measure to the respective pure-noise reference
value~(Fig.\,\ref{fig:pipeline_full}d). However, compressing $f_i(t)$ into a
median across $k_i(t)$ conceals many kernel-specific details. It is therefore
necessary to consider the development of each $f_i(t)$ over $\sca$
separately~(Fig.\,\ref{fig:pipeline_full}c). Indeed, all 40 $f_i(t)$ in the set
reach a saturation regime for sufficiently large $\sca$. The saturated $\muf$
are distributed over a range of values --- which is the prerequisite for
forming species-specific combinations --- but are limited to a rather small
subset of possible values between 0 and 1. Based on previous
results~(Fig.\,\ref{fig:thresh-lp_single}f), this is likely due to the capping
of $\adapt(t)$ that prevents $f_i(t)$ from reaching its intrinsic saturation
value; but this cannot be confirmed until the following
analysis~(Fig.\,\ref{fig:pipeline_short}). Looking at the kernel-specific SNR
values of $c_i(t)$ over $\sca$~(Fig.\,\ref{fig:pipeline_full}e) and $f_i(t)$
over $\sca$~(Fig.\,\ref{fig:pipeline_full}f) reveals a high degree of variation
between different $k_i(t)$. Certain $f_i(t)$ achieve much higher SNR values
than $c_i(t)$ for the same $\sca$ due to the former's capacity for arbitrarily
low pure-noise responses ($\muf\to0$) and hence arbitrarily high SNR values.
Finally, the question remains whether the suspected improvement of intensity
invariance by $f_i(t)$ beyond $c_i(t)$ holds at the level of individual
$k_i(t)$. The single saturation points based on the median across $k_i(t)$ for
$c_i(t)$ and $f_i(t)$ are expanded into distributions of kernel-specific
saturation points~(Fig.\,\ref{fig:pipeline_full}g). For $c_i(t)$, the
distribution is rather narrow and corresponds well to the single saturation
point based on the median. For $f_i(t)$, however, the distribution is much
broader and is not centered around the single saturation point based on the
median but rather shifted towards lower $\sca$. Care must be taken when
interpreting the height of either distribution due to the logarithmic scaling
of the underlying $\sca$ axis. Nevertheless, the overall pattern suggests that
the saturation points of specific $f_i(t)$ are indeed lower than those of their
$c_i(t)$ counterparts. Therefore, the effect of thresholding and temporal
averaging on intensity invariance is not necessarily nullified by the previous
logarithmic compression and adaptation.
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_full_Omocestus_rufipes.pdf}
\caption{\textbf{Step-wise emergence of intensity-invariant song
representations along the model pathway.}
Input $\raw(t)$ consists of $\soc(t)$ scaled by $\sca$
with added $\noc(t)$ and is processed up to the feature
set $f_i(t)$ using kernel-specific threshold values
$\thr=2\cdot\sigma_{\eta_i}$ (appendix
Fig.\,\ref{fig:app_full_kern-sd}). Different color shades
indicate different types of Gabor kernels with specific
lobe number $\kn$ and either $+$ or $-$ sign, sorted (dark
to light) first by increasing $\kn$ and then by
sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for
each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and
$16\,$ms per type; 8 types, 40 $k_i(t)$ in total).
\textbf{a}:~Examples of $\filt(t)$, $\env(t)$, $\db(t)$,
$\adapt(t)$, $c_i(t)$, and $f_i(t)$ for different $\sca$.
\textbf{b}:~Intensity measures over $\sca$. The median
over $k_i(t)$ is shown for $c_i(t)$ and $f_i(t)$. Dots
indicate $95\,\%$ curve span for $\db(t)$, $\adapt(t)$,
$c_i(t)$, and $f_i(t)$.
\textbf{c}:~Average value $\muf$ of each $f_i(t)$
over $\sca$.
\textbf{d}:~Ratio of intensity measures from \textbf{b} to
the respective pure-noise reference for $\sca=0$.
\textbf{e}:~Ratio of standard deviation $\sigma_{c_i}$ of
each $c_i(t)$.
\textbf{f}:~Ratio of $\muf$.
\textbf{g}:~Distributions of kernel-specific $\sca$ that
correspond to $95\,\%$ curve span for $c_i(t)$ and
$f_i(t)$. Dots indicate values based on the median from
\textbf{b}.
}
\label{fig:pipeline_full}
\end{figure}
\FloatBarrier
\subsubsection{Excluding logarithmic compression}
The previous analysis was repeated in exactly the same way as before, except
that the logarithmic compression of $\env(t)$, Eq.\,\ref{eq:log}, was skipped
in order to disable the first mechanism of intensity invariance. Consequently,
$\adapt(t)$ is merely a highpass filtered version of $\env(t)$; and $\db(t)$ is
missing entirely~(Fig.\,\ref{fig:pipeline_short}a). As expected, all
representations prior to the thresholding nonlinearity $\nl$ --- $\filt(t)$,
$\env(t)$, $\adapt(t)$, and $c_i(t)$ --- show a linear increase of the
intensity measure for larger $\sca$, while $f_i(t)$ is the only representation
to reach a saturation regime~(Fig.\,\ref{fig:pipeline_short}bd). The
saturated $\muf$ are distributed over a much broader range of values than in
the previous analysis~(Fig.\,\ref{fig:pipeline_short}c). Intriguingly, the
distribution of $\muf$ is symmetric around a value of 0.5. This is relevant
because every kernel $k^+(t)$ in the underlying kernel set has a counterpart of
opposite sign that is otherwise identical, so that $k^+(t)=-k^-(t)$. The
responses of $k^+(t)$ and $k^-(t)$ to the same input $\adapt(t)$ are also
inverted because convolution is a linear operation: $c^+(t)=-c^-(t)$. The
distributions of $c^+(t)$ and $c^-(t)$ are hence inverted to each other, as
well: $p(c^+)=p(-c^-)$. Based on Eq.\,\ref{eq:feat_prop}, transforming $c^+(t)$
and $c^-(t)$ further using the same $\Theta$ thus results in two complementary
features $f^+(t)$ and $f^-(t)$ that are symmetric around 0.5, so that
$f^+(t)=1-f^-(t)$. Of course, this symmetry throughout the feature
representation goes hand in hand with a substantial degree of redundancy and is
hardly expected to be present in the actual grasshopper auditory system. But
the fact that the saturated $\muf$ are distributed symmetrically around 0.5
provides concrete evidence that each $f_i(t)$ is able to reach its intrinsic
saturation level in the absence of logarithmic
compression~(Fig.\,\ref{fig:pipeline_short}c), which is otherwise prevented by
the capping of $\adapt(t)$, as seen during previous
analyses~(Fig.\,\ref{fig:thresh-lp_single}f and
Fig.\,\ref{fig:pipeline_full}c). Otherwise, there appear to be no major
differences in the development of $f_i(t)$ over $\sca$ compared to the previous
analysis, neither on the kernel-specific SNR
values~(Fig.\,\ref{fig:pipeline_short}e) nor on the distribution of
kernel-specific saturation points~(Fig.\,\ref{fig:pipeline_short}f). Overall,
the most substantial consequence of skipping the logarithmic compression is
that it allows $f_i(t)$ to reach its intrinsic saturation value. If this
results in a wider range of $\muf$ across the feature set, it should be
beneficial for forming species-specific combinations. However, this depends on
multiple different factors such as the choice of $k_i(t)$ and $\thr$ as well as
the structure and distribution of the specific song and is hence not guaranteed
simply by disabling logarithmic compression.
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_short_Omocestus_rufipes.pdf}
\caption{\textbf{Effects of disabling logarithmic compression on intensity
invariance along the model pathway.}
Same input and processing as in
Fig.\,\ref{fig:pipeline_full}, using kernel-specific
threshold values $\thr=2\cdot\sigma_{\eta_i}$ (appendix
Fig.\,\ref{fig:app_short_kern-sd}), except that
logarithmic compression and hence $\db(t)$ are skipped.
\textbf{a}:~Examples of $\filt(t)$, $\env(t)$,
$\adapt(t)$, $c_i(t)$, and $f_i(t)$ for different $\sca$.
\textbf{b}:~Intensity measures over $\sca$. The median
over $k_i(t)$ is shown for $c_i(t)$ and $f_i(t)$. Dot
indicates $95\,\%$ curve span for $f_i(t)$.
\textbf{c}:~Average value $\muf$ of each $f_i(t)$
over $\sca$.
\textbf{d}:~Ratio of intensity measures from \textbf{b} to
the respective pure-noise reference for $\sca=0$.
\textbf{e}:~Ratio of $\muf$.
\textbf{f}:~Distribution of kernel-specific $\sca$ that
correspond to $95\,\%$ curve span for $f_i(t)$. Dot
indicates value based on the median from \textbf{b}.
}
\label{fig:pipeline_short}
\end{figure}
\FloatBarrier
\subsubsection{Intensity invariance in a naturalistic setting}
% This one appears...meh?
So far, the analyses on intensity invariance were based on synthetically
generated input signals, since these allow for a systematic manipulation of the
mixture of song component $\soc(t)$ and noise component $\noc(t)$ over an
arbitrary range of scales $\sca$. Now, the question remains how the model
pathway performs under more naturalistic conditions. The previous analysis of
the full model pathway~(Fig.\,\ref{fig:pipeline_full}) was hence repeated,
using field recordings of a song of \textit{P. parallelus} as input $\raw(t)$
and a segment of background noise from the same recordings as pure-noise
reference. Recordings were taken simultaneously at eight different distances
$d$ from the sender, ranging from $10\,$cm to $220\,$cm with intervals of
$30\,$cm between microphones. The precise value of $\sca$ that corresponds to a
given $d$ cannot be determined in a straightforward manner, but $\sca$ is
expected to be inversely proportional to $d$ based on the inverse-square law of
sound propagation. All intensity measures and ratios thereof were hence plotted
over $1/d$ on a double-logarithmic scale, which is comparable to previous
analyses insofar as a decade on the $1/d$ axis corresponds to a decade on
the $\sca$ axis. To complicate matters further, the $1/d$ axis is sampled too
sparsely to determine saturation points as before based on the $95\,\%$ curve
span. Instead, one has to rely on the slope of the curve to assess if, and at
which $1/d$, a given representation reaches a saturation regime. Bearing these
limitations in mind, the intensity measures of each representation over
$1/d$~(Fig.\,\ref{fig:pipeline_field}b) follow a pattern that is consistent
with the results of the previous simulation-based
analysis~(Fig.\,\ref{fig:pipeline_full}b): The standard deviations of
$\filt(t)$ and $\env(t)$ increase linearly with $1/d$, respectively. The
standard deviations of $\db(t)$, $\adapt(t)$, and $c_i(t)$ show a weaker
increase with $1/d$ and appear to approach, but not reach, a saturation regime
for larger $1/d$. The average feature values $\muf$ of $f_i(t)$ show an even
weaker increase with $1/d$ and appear to reach a saturation regime for
$d=40\,$cm and $d=10\,$cm, which is consistent across most $f_i(t)$ in the
set~(Fig.\,\ref{fig:pipeline_field}c). Saturation of $f_i(t)$ without
saturation of $c_i(t)$ suggests that the input $\raw(t)$ at the smallest
$d=10\,$cm corresponds to a value of $\sca$ between 10 and 20 based on
comparison with the simulation-based analysis~(Fig.\,\ref{fig:pipeline_full}b).
The saturated $\muf$ are distributed over a comparably narrow range of values,
which could in part be a property of the songs of \textit{P. parallelus}~(see
also Fig.\,\ref{fig:thresh-lp_species}bc). The ratios of each intensity measure
to the respective pure-noise reference value are not aligned across
representations~(Fig.\,\ref{fig:pipeline_field}d) or
kernels~(Fig.\,\ref{fig:pipeline_field}ef) but serve to consolidate the
previous observation that only $f_i(t)$ exhibits some degree of intensity
invariance within the available range of $1/d$. Based on the current results,
this intensity invariance of $f_i(t)$ in the field holds up to a distance of
around $40\,$cm from the sender, decays steadily between $40\,$cm and
$130\,$cm, and is substantially diminished for larger
distances~(Fig.\,\ref{fig:pipeline_field}a, bottom row).
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_field.pdf}
\caption{\textbf{Intensity invariance along the model pathway in a
naturalistic setting.}
Input $\raw(t)$ consists of a song of \textit{P.
parallelus} recorded in the field at eight different
distances $d$ and is processed up to the feature set
$f_i(t)$ using kernel-specific threshold values
$\thr=2\cdot\sigma_{\eta_i}$ (appendix
Fig.\,\ref{fig:app_field_kern-sd}). Different color shades
indicate different types of Gabor kernels with specific
lobe number $\kn$ and either $+$ or $-$ sign, sorted (dark
to light) first by increasing $\kn$ and then by
sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for
each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and
$16\,$ms per type; 8 types, 40 $k_i(t)$ in total).
\textbf{a}:~$\filt(t)$, $\env(t)$, $\db(t)$, $\adapt(t)$,
$c_i(t)$, and $f_i(t)$ at each $d$. A noise segment from
the same recording is shown for reference.
\textbf{b}:~Intensity measures over $d$. The median over
$k_i(t)$ is shown for $c_i(t)$ and $f_i(t)$.
\textbf{c}:~Average value $\muf$ of each $f_i(t)$ over
$d$.
\textbf{d}:~Ratio of intensity measures from \textbf{b} to
the respective value obtained from the noise reference.
\textbf{e}:~Ratio of standard deviation $\sigma_{c_i}$ of
each $c_i(t)$.
\textbf{f}:~Ratios of $\muf$.
}
\label{fig:pipeline_field}
\end{figure}
\FloatBarrier
\subsection{Interspecific and intraspecific feature variability}
In the final analysis of the current study, we investigated the variability of
songs in the feature representation between different species and within the
same species~(Fig.\,\ref{fig:feat_cross_species}). Naturally, a feature
representation that is both consistent across different songs of the same
species and sufficiently different between songs of different species is a
fundamental prerequisite for species-specific song recognition. The data used
in this analysis corresponds to the saturated $\muf$ of each $f_i(t)$ from the
previous analysis of the full model pathway~(Fig.\,\ref{fig:pipeline_full}c),
using different songs of \textit{O. rufipes} for the intraspecific comparisons
and single songs from a number of species for the interspecific comparisons
(also shown in Fig.\,\ref{fig:thresh-lp_species}a). Accordingly, each song is
represented by 40 values of $\muf$ based on the same set of $f_i(t)$. For each
comparison, $\muf$ from one song was plotted against $\muf$ from the other
song, so that each dot within a subplot corresponds to a single feature
$f_i(t)$. For the intraspecific
comparisons~(Fig.\,\ref{fig:feat_cross_species}, upper triangular), the pairs
of $\muf$ are distributed closely around the diagonal, with a minimum
correlation coefficient of $\rho=0.82$, a maximum of $\rho=0.99$, and a median
of $\rho=0.91$. A given $f_i(t)$ thus tends to have a similar $\muf$ across
different songs of the same species. In contrast, the pairs of $\muf$ for the
interspecific comparisons~(Fig.\,\ref{fig:feat_cross_species}, lower
triangular) are distributed in a variety of different ways, most in broader
clouds (e.g.\ \textit{C. biguttulus} vs.\ \textit{C. mollis}) but some more
narrowly around the diagonal (e.g.\ \textit{P. parallelus} vs.\ \textit{C.
dispar}). The correlation coefficients $\rho$ vary widely between different
interspecific comparisons, with a minimum of $\rho=-0.1$, a maximum of
$\rho=0.91$, and a median of $\rho=0.40$. A given $f_i(t)$ therefore tends to
have a less similar $\muf$ across different species than within the same
species, although certain exceptions exist~(Fig.\,\ref{fig:feat_cross_species},
lower right). Accordingly, the feature representation that is generated by the
model pathway is, in principle, suitable for the distinction between different
species-specific songs. However, even the songs of the same species are subject
to considerable variability in various aspects and depending on a multitude of
external and internal factors, which cannot be fully captured based on a
limited number of songs. The results of the current analysis are hence to be
treated as a proof-of-concept that paves the way towards more comprehensive
investigations on the details of song representation in feature space,
including the effects of different parameters of the model pathway as well as
the inclusion of additional songs and species to reflect the complexity of
natural song variation.
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_features_cross_species.pdf}
\caption{\textbf{Interspecific and intraspecific feature variability.}
Average value $\muf$ of each feature $f_i(t)$ against its
counterpart from a 2nd feature set based on a different
input $\raw(t)$. Data is based on the saturated $\muf$
from Fig.\,\ref{fig:pipeline_full}. Each dot within a
subplot represents a single $f_i(t)$. Different color
shades indicate different types of Gabor kernels with
specific lobe number $\kn$ and either $+$ or $-$ sign,
sorted (dark to light) first by increasing $\kn$ and then
by sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for
each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and
$16\,$ms per type; 8 types, 40 kernels in total).
\textbf{Lower triangular}:~Interspecific comparisons
between single songs of different species.
\textbf{Upper triangular}:~Intraspecific comparisons
between different songs of a single species (\textit{O.
rufipes}).
\textbf{Lower right}:~Distribution of correlation
coefficients $\rho$ for each interspecific and
intraspecific comparison. Dots indicate single $\rho$
            values.
}
\label{fig:feat_cross_species}
\end{figure}
\FloatBarrier
\newpage
\section{Discussion}
In the current study, we have established a physiologically inspired functional
model of the grasshopper song recognition pathway. The model pathway covers the
entire auditory processing stream, from the sound reception at the tympanal
membrane over peripheral receptor neurons and local interneurons up to the
generation of a high-dimensional feature representation at the level of the
ascending neurons and beyond in the SEG. Using this model pathway, we have
identified two computational key mechanisms for the emergence of
intensity-invariant song representations. Each mechanism comprises a nonlinear
transformation and a subsequent linear transformation. The first mechanism
consists of logarithmic compression and adaptation, which takes place at the
level of the receptor neurons and local interneurons. The second mechanism
consists of thresholding and temporal averaging, which takes place either at
the level of the ascending neurons or further downstream in the SEG. Systematic
investigation of both mechanisms revealed a persistent trade-off between the
intensity invariance and the SNR of the song representations along the pathway.
In the following, we discuss the capabilities and limitations of our model
approach as well as the implications of our findings for the design of the
grasshopper auditory system, the evolution of species-specific grasshopper
songs, and the ethological relevance of intensity invariance in a natural
acoustic environment.
\subsection{Leveraging functional modelling to investigate sensory systems}
Our understanding of sensory processing systems is based on the distributed
accumulation of anatomical, physiological, and ethological evidence. Functional
modelling provides a powerful tool to integrate the available fragments into a
coherent whole. It facilitates systematic, reproducible investigations of
relevant parameters such as scale $\sca$ or threshold value $\thr$. Moreover,
it allows addressing questions of broader scope by generalizing from concrete
evidence. For instance, the interaction between the two mechanisms of intensity
invariance is most accessible if both mechanisms can be treated as consecutive
stages along the pathway --- where the output of the first stage relates
directly to the input of the second stage --- rather than separate entities.
The model pathway also provides a general basis for comparing song
representations across different species without the need for species-specific
models. However, the potential of functional modelling for research on sensory
systems depends entirely on the amount of available knowledge about the system.
The grasshopper song recognition pathway is a comparably simple and very
well-understood system and is therefore a particularly suitable candidate for
functional modelling. Other sensory systems that are either more complex or
have not been subject to decades of study will likely not be suitable for this
approach yet.
\subsection{Feature representation, temporal averaging, and song design}
\label{sec:constant_feat}
The feature set is the final song representation along the model pathway and
constitutes the basis for song recognition. Each feature $f_i(t)$ results from
the thresholding of the respective kernel response $c_i(t)$ by $\nl$ and the
subsequent temporal averaging of binary response $b_i(t)$ by a lowpass filter
with extremely low cutoff frequency $\fc$. At a given time point $t$, $f_i(t)$
approximately quantifies the proportion of time during which $c_i(t)$ exceeds
the threshold value $\thr$ within the averaging interval $\tlp$ specified by
$\fc$. The value of $f_i(t)$ is hence determined by $\thr$ with respect to the
distribution $\pci$ of $c_i(t)$ and is restricted to the interval $[0,1]$.
Different species-specific songs are represented by different combinations of
feature values, which should preferably be constant for the duration of a song
to enable reliable recognition. The fundamental requirement for a constant
$f_i(t)$ is that the time where $c_i(t)>\thr$ during $\tlp$ is the same for all
$t$, which is fulfilled if $\pci$ is stable across $t$. The most
straightforward way to achieve a stable $\pci$ is that $c_i(t)$ is periodic and
$\tlp$ is sufficiently long to average over multiple cycles of $c_i(t)$.
Song-evoked $c_i(t)$ are indeed approximately periodic, which is largely an
inherited property of the song itself. Most grasshopper songs are produced by
stridulation, which refers to the pulling of the serrated stridulatory file on
the hindlegs across a resonating vein on the
forewings~(\bcite{helversen1977stridulatory}; \bcite{stumpner1994song};
\bcite{helversen1997recognition}). Every ``tooth'' that strikes the vein
generates a brief sound pulse; multiple pulses make up a syllable; and the
repetition of syllables and pauses results in a pattern with a high degree of
temporal regularity. Accordingly, a robust feature representation in the sense
of constant $f_i(t)$ is tightly linked to the mechanism of sound production and
the temporal structure of the generated song.
Various grasshopper species, especially those with longer songs like \textit{C.
mollis}, \textit{G. rufus}, or \textit{O. rufipes}, tend to stridulate softly
at first and then continuously increase the amplitude of their song over time.
This slow ``ramping'' amplitude modulation makes the overall song less periodic
despite its temporal regularity. The ``ramping'' appears more pronounced in
$\env(t)$ compared to $\adapt(t)$, which suggests that the logarithmic
compression and adaptation during the preprocessing stage might be at least
partially beneficial for mitigating the effect of this amplitude modulation on
later representations. However, the adaptation of $\adapt(t)$ can only act on
certain time scales --- depending on the cutoff frequency of the underlying
highpass filter --- and is hence not able to compensate for ``ramping'' across
the entire duration of a song.
Certain grasshopper species like \textit{Chorthippus dorsatus} are known to
switch their stridulation pattern in the middle of a
song~(\bcite{stumpner1994song}). \textit{C. dorsatus} starts stridulating with
both hindlegs in synchrony and thereby generates a pronounced syllable-pause
pattern similar to that of \textit{P. parallelus}. For the last part of its
song, however, \textit{C. dorsatus} switches to an alternating leg movement,
which results in a more continuous but not entirely unstructured rattling
sound. It is unclear what this composite design means for the feature
representation of \textit{C. dorsatus} songs. In principle, both parts of the
song could result in similar $\pci$ despite their different temporal structure,
which would allow for consistent $f_i(t)$ across the entire song. However, it
appears more likely that only one part of the song encodes species identity,
while the other part serves a different purpose such as fitness
advertisement~(SOURCE?).
Finally, the question remains how the choice of an appropriate averaging
interval $\tlp$ depends on the duration and temporal structure of a song. The
minimum $\tlp$ should encompass at least a few cycles of $c_i(t)$ to ensure a
stable $\pci$ and hence a constant $f_i(t)$. The maximum $\tlp$ should not
exceed the duration of a song to avoid the inclusion of behaviorally irrelevant
information. The longer $\tlp$, the longer $f_i(t)$ takes to stabilize after
the onset and before the offset of a song, which narrows the time window for
reliable recognition. The duration of species-specific grasshopper songs can
range from a few hundred milliseconds (e.g.\ \textit{Stethophyma grossum}) to
well over a minute (e.g.\ \textit{C. mollis}), so that the optimal $\tlp$ is
likely to differ between species.
\subsection{Sensory invariances in the grasshopper auditory system}
The notion of invariance is fundamental for sensory processing systems.
Invariance, in the general sense, can be described as the property of a
transformation to maintain variation across certain meaningful input parameters
in its output while discarding variation across other input parameters. This
boils down to a selective input-output decorrelation that allows the system to
represent only those aspects of the stimulus that are behaviorally relevant to
the organism.
The grasshopper auditory system has to deal with a number of sources of
non-informative song variation. For instance, the temporal structure of the
song pattern warps with temperature~(\bcite{skovmand1983song}). This also
affects certain structural parameters that are essential for song recognition,
mainly the duration of syllables and pauses. The auditory system can compensate
for this variation by reading out relative temporal relationships rather than
absolute time intervals~(\bcite{creutzig2009timescale};
\bcite{creutzig2010timescale}). The ratio of syllable duration to pause
duration is relatively constant across temperatures and has been shown to be
suitable for song recognition~(\bcite{helversen1972gesang}), so that there is
likely no need to retain any information about the absolute duration of
syllables and pauses.
The situation is more complex for variations in song intensity. Song intensity
at the receiver's position depends mostly on the distance to the sender and is
hence not a reliable cue to infer species identity. The auditory system should
therefore be invariant to intensity variations to recognize conspecific songs
regardless of sender distance. However, song intensity --- specifically, the
interaural intensity difference --- is also required for directional hearing,
which is essential for phonotaxis~(\bcite{helversen1988interaural}). Conflicts
between song recognition and directional hearing are avoided in the auditory
system by distributing both functions across two parallel
pathways~(\bcite{helversen1984parallel}; \bcite{ronacher1986routes}). This is
the main reason why our model pathway is focused entirely on song recognition
and has no capacity for directional hearing, no matter how relevant it may be
to the grasshopper.
Furthermore, ``invariance to variations in song intensity'' does not do justice
to the full extent of the problem. Intensity is a function of song amplitude
within a certain time frame. It can refer to the individual syllables and
pauses of the song pattern as well as the entire song --- the former is
relevant for song recognition, while the latter is not. Intensity invariance in
the current context can therefore be described as time scale-selective
sensitivity to the faster amplitude dynamics of the song pattern and
simultaneous insensitivity to slower, more sustained amplitude dynamics. In the
model pathway, this time scale selectivity is reflected by the cutoff frequency
$\fc$ of the highpass filter that underlies the adaptation of $\adapt(t)$: Most
$\fc$ are effective in removing the local offset of $\db(t)$ and render
$\adapt(t)$ intensity-invariant, but only sufficiently low $\fc$ will leave the
relevant amplitude dynamics of the song pattern intact.
\subsection{Intensity invariance versus SNR}
Each processing step along the model pathway is a transformation between input
representation and output representation. The intensity of the input is
characterized by scale $\sca$. The intensity of the output is characterized by
an appropriate intensity measure. If the transformation renders the output more
intensity-invariant, then the intensity measure will saturate for sufficiently
large $\sca$, which caps the output SNR to a constant value across these
$\sca$. Otherwise, the intensity measure and hence the output SNR will increase
monotonically with $\sca$. The trade-off between intensity invariance and SNR
refers to the principle that a transformation can either improve intensity
invariance or maintain SNR --- it cannot do both at the same time. This
principle is presumably not specific to the two mechanisms along the model
pathway but rather a general property of transformations that equalize between
different input intensities.
Logarithmic compression and adaptation by highpass filtering is capable of
equalizing a wide range of $\sca$. In the absence of noise component $\noc(t)$,
output $\adapt(t)$ is a perfectly intensity-invariant representation of song
component $\soc(t)$ across all $\sca>0$. However, the presence of $\noc(t)$
limits the effectiveness of this mechanism to sufficiently large $\sca$. This
means that intensity invariance and SNR interact at the input level, as well.
Specifically, the saturation point of $\adapt(t)$ is determined by the input
SNR of $\env(t)$, which in turn depends on the initial SNR of the sound signal
$\raw(t)$. This initial SNR is presumably improved by the bandpass filtering of
$\raw(t)$ into $\filt(t)$ at the tympanal membrane, which attenuates
frequencies outside the relevant range of grasshopper songs. The SNR is then
further improved by the rectification and lowpass filtering of $\filt(t)$ into
$\env(t)$. This improvement depends on the cutoff frequency $\fc$ of the
lowpass filter --- the lower $\fc$, the higher the SNR of $\env(t)$ at a given
$\sca$. However, $\fc$ must not be too low to avoid the attenuation of relevant
amplitude dynamics of the song pattern. The saturation level of $\adapt(t)$,
unlike its saturation point, is independent of the SNR of $\env(t)$ because the
influence of $\noc(t)$ is negligible for sufficiently large $\sca$. The output
SNR of $\adapt(t)$ saturates at a comparatively low value of around 10. This might
in part be a consequence of the logarithm, which compresses different higher
intensities but also amplifies lower intensities, including the noise floor.
Both the saturation level and the saturation point of $\adapt(t)$ vary between
different species and individual songs. These differences are likely rooted in
the way in which logarithmic compression acts on the specific distribution of
$\env(t)$, which is determined by $\fc$ as well as the temporal structure and
frequency spectrum of the rectified $\filt(t)$.
Thresholding and temporal averaging renders feature $f_i(t)$
intensity-invariant for sufficiently large $\sca$. The trade-off between
intensity invariance and SNR is mediated by threshold value $\thr$. A lower
$\thr$ ($\thr\to0$) improves intensity invariance by shifting the saturation
point towards lower $\sca$ but also decreases the SNR of $f_i(t)$. The
saturation level of $f_i(t)$ is independent of $\thr$ as long as the intensity
invariance by the previous mechanism is neglected. The SNR of $f_i(t)$ is
therefore determined solely by the pure-noise response of $f_i(t)$. The
distribution $\pci$ of the pure-noise kernel response $c_i(t)$ is largely a
normal distribution with mean $\mu\approx0$ for all kernels $k_i(t)$. The value
of the pure-noise $f_i(t)$ is hence 0.5 for $\thr=0$ and decreases for higher
$\thr$. If $\thr$ is set above the maximum of $c_i(t)$, the pure-noise feature
value is 0, which results in an ``unlimited'' SNR of $f_i(t)$. In this case, any
non-zero feature value that is sustained for a sufficient duration could serve
as an indicator for the presence of $\soc(t)$, although at the cost of a higher
saturation point. The maximum of the pure-noise $c_i(t)$ is assumed to be very
small due to the various SNR improvements along the pathway, so that the
required increase in $\thr$ and hence the saturation point of $f_i(t)$ is not
expected to be substantial. However, exploiting the capacity of $f_i(t)$ for
arbitrarily high SNR would certainly require a fine evolutionary tuning of
$\thr$ to the properties of both the species-specific song and the natural
noise in a certain habitat.
\newpage
\subsection{Intensity invariance versus intensity invariance}
Two consecutive mechanisms of intensity invariance do not necessarily add up to
a stronger overall intensity invariance. If the first mechanism results in a
lower saturation point than the second mechanism by itself, the saturation
point of feature $f_i(t)$ will be determined solely by the first mechanism. In
this case, the saturation level of $f_i(t)$ will conform to the intensity that
$f_i(t)$ can reach for the given saturation point rather than the intrinsic
saturation level of $f_i(t)$. Conversely, if the second mechanism results in a
lower saturation point than the first mechanism, both the saturation point and
the saturation level of $f_i(t)$ will be determined by the second mechanism.
The saturation points of $f_i(t)$ across the set are distributed over a much
wider range than those of the preceding kernel responses $c_i(t)$, which
suggests that the interaction between the two mechanisms is specific to
individual kernels $k_i(t)$. A number of $f_i(t)$ achieve a lower saturation
point than the respective $c_i(t)$, while some $f_i(t)$ exhibit similar or only
marginally lower saturation points. This raises the question whether two
consecutive mechanisms of intensity invariance are actually beneficial for the
overall system.
From a purely functional perspective, the answer could be that logarithmic
compression and adaptation is a necessary preprocessing step towards a robust
feature representation, even if thresholding and temporal averaging alone would
be sufficient to render $f_i(t)$ intensity-invariant. This preprocessing likely
improves the temporal regularity of the song pattern in $\adapt(t)$ and
$c_i(t)$, which is required for constant $f_i(t)$ across the duration of a
song~(Section\,\ref{sec:constant_feat}). It also ensures consistency between
the distribution $\pci$ of $c_i(t)$ across songs of different intensity, which
is essential for the generation of consistent species-specific $f_i(t)$ under a
static $\thr$. From a physiological perspective, the answer is likely that
neurons possess only a limited range of firing rates for encoding stimulus intensities
that can range over several orders of magnitude. Sigmoidal tuning curves over
logarithmically compressed stimulus intensities are a common property of
sensory neurons across various modalities~(SOURCE?), and neurons of the
grasshopper auditory system are no exception~(\bcite{suga1960peripheral};
\bcite{gollisch2002energy}).
\subsection{Implications for behavior in a natural acoustic environment}
Most grasshoppers live in environments that are communally inhabited by
numerous individuals from multiple species. Their acoustic environment is
characterized by noise from various sources --- abiotic ones like wind and
water, but also the songs of both hetero- and conspecifics. This limits the SNR
that each individual can achieve for its own song, and hence the effectiveness
of the intensity-invariant processing in the auditory system. Producing higher
song intensities is not a viable solution to this problem, because these also
contribute to the overall noise floor. A possible behavioral solution could be
to produce songs in a ``turn-taking'' manner to avoid the temporal superposition
of multiple songs into overly intense signals. This would also prevent the
mutual distortion of the respective song patterns. Another solution could be to
spatially separate from other nearby grasshoppers to spread the potential noise
sources over a larger area. However, according to our analysis based on field
recordings as well as previous work on the topic~(\bcite{lang2000acoustic}),
reliable song recognition is limited to little more than 1\,m from the sender,
so that a grasshopper also cannot afford to stay too far away from its
conspecifics. A better solution may hence be to collectively produce songs at
lower-than-possible intensities, which would reduce the overall noise floor for
all nearby individuals. Importantly, the limitation of intensity invariance by
SNR likely applies to all grasshoppers regardless of species, so that the
behavioral strategies could be shared among the species that coexist in a given
habitat.
% Because the presumed restriction of song recognition
% by means of the noise floor applies to all grasshoppers in a certain area,
% these strategies may not be specific to some of the species at this location.
% Instead, they must be shared by all grasshopper species that coexist within a
% portion of a given habitat, which would provide an important implication for
% the evolution of grasshopper songs in communities of multiple species.
%%% RELICS OF INTRODUCTION %%%
% - Nonlinear operations can be used to detach representations from graded physical
% stimulus (to facilitate categorical behavioral decision-making?):\\
% 1) Capture sufficiently precise amplitude information: $\env(t)$, $\adapt(t)$\\
% $\rightarrow$ Closely following the AM of the acoustic stimulus\\
% 2) Quantify relevant stimulus properties on a graded scale: $c_i(t)$\\
% $\rightarrow$ More decorrelated representation, compared to prior stages\\
% 3) Nonlinearity: Distinguish between "relevant vs irrelevant" values: $b_i(t)$\\
% $\rightarrow$ Trading a graded scale for two or more categorical states\\
% 4) Represent stimulus properties under relevance constraint: $f_i(t)$\\
% $\rightarrow$ Graded again but highly decorrelated from the acoustic stimulus\\
% 5) Categorical behavioral decision-making requires further nonlinearities\\
% $\rightarrow$ Parameters of a behavioral response may be graded (e.g. approach speed),
% initiation of one behavior over another is categorical (e.g. approach/stay)
% Multi-species, multi-individual communally inhabited environments\\
% - Temporal overlap: Simultaneous singing across individuals/species common\\
% - Frequency overlap: Little speciation into frequency bands (likely unused)\\
% - "Biotic noise": Hetero-/conspecifics ("Another one's songs are my noise")\\
% - "Abiotic noise": Wind, water, vegetation, anthropogenic\\
% - Effects of habitat structure on sound propagation (landscape - soundscape)\\
% $\rightarrow$ Sensory constraints imposed by the (acoustic) environment
% Cluster of auditory challenges (interlocking constraints $\rightarrow$ tight coupling):\\
% From continuous acoustic input, generate neuronal representations that...\\
% 1)...allow for the separation of relevant (song) events from ambient noise floor\\
% 2)...compensate for behaviorally non-informative song variability (invariances)\\
% 3)...carry sufficient information to characterize different song patterns,
% recognize the ones produced by conspecifics, and make appropriate behavioral
% decisions based on context (sender identity, song type, mate/rival quality)
% How can a human observer conceive a grasshopper's auditory percepts?\\
% - How to investigate the workings of the auditory pathway as a whole?\\
% - How to systematically test effects and interactions of processing parameters?\\
% - How to integrate the available knowledge on anatomy, physiology, ethology?\\
% $\rightarrow$ Abstract, simplify, formalize $\rightarrow$ Functional model framework
\newpage
\section{Appendix}
% Not sure if we really need this one. Might raise more questions than it
% provides answers. The noise component is not stable throughout nonlinear
% transformations, that is all the reader needs to know, i believe.
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_noise_env_sd_conversion_appendix.pdf}
\caption{\textbf{Conversion of the noise component by envelope extraction.}
Standard deviation $\sigma_{\eta}$ of noise component
$\noc(t)$ within the signal envelope $\env(t)$ over scale
$\sca$. Based on input $\raw(t)$ with $\sigma_{\eta}=1$
(corresponding to the analysis underlying
Fig.\,\ref{fig:rect-lp}), using 100 random realizations of
$\noc(t)$.}
\label{fig:app_env-sd}
\end{figure}% Referenced.
\FloatBarrier
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_rect-lp_appendix.pdf}
\caption{\textbf{Species-specific data underlying Fig.\,\ref{fig:rect-lp}e.}
Ratio of the standard deviation $\sigma_{\text{env}}$ to
the pure-noise reference $\sigma_{\eta}$ of the signal
envelope $\env(t)$ over scale $\sca$ for different cutoff
frequencies $\fc$ of the lowpass filter extracting
$\env(t)$. Solid lines and shaded areas indicate mean
$\pm$ standard deviation across songs per recording.
Dashed lines indicate mean across recordings (shown in
Fig.\,\ref{fig:rect-lp}e).}
\label{fig:app_rect-lp}
\end{figure}% Referenced.
\FloatBarrier
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_log-hp_appendix.pdf}
\caption{\textbf{Species-specific data underlying Fig.\,\ref{fig:log-hp}e.}
Ratio of the standard deviation $\sigma_{\text{adapt}}$ to
the pure-noise reference $\sigma_{\eta}$ of the
intensity-adapted envelope $\adapt(t)$ over scale $\sca$.
Solid lines and shaded areas indicate mean $\pm$ standard
deviation across songs per recording. Dashed lines
indicate mean across recordings (shown in
Fig.\,\ref{fig:log-hp}e).}
\label{fig:app_log-hp_curves}
\end{figure}% Referenced.
\FloatBarrier
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_saturation_log-hp_appendix.pdf}
\caption{\textbf{Species-specific saturation points underlying
Fig.\,\ref{fig:log-hp}e.}
Distribution of saturation points ($95\,\%$ curve span) of
ratio $\sigma_{\text{adapt}} / \sigma_{\eta}$ of the
intensity-adapted envelope $\adapt(t)$ over scale $\sca$
across all available songs. Dots indicate the saturation
point of the mean curve across songs and recordings (shown
in Fig.\,\ref{fig:log-hp}e, see also appendix
Fig.\,\ref{fig:app_log-hp_curves}).}
\label{fig:app_log-hp_saturation}
\end{figure}% Referenced.
\FloatBarrier
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh-lp_pure_appendix.pdf}
\caption{\textbf{Species-specific data underlying Fig.\,\ref{fig:thresh-lp_species}bd.}
Average value $\muf$ of each of the three features
$f_i(t)$ over scale $\sca$ in the noiseless case. Solid
lines and shaded areas indicate mean $\pm$ standard
deviation across songs per recording. Dashed lines
indicate mean across recordings (shown in
Fig.\,\ref{fig:thresh-lp_species}bd).}
\label{fig:app_thresh-lp_pure}
\end{figure}% Referenced.
\FloatBarrier
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh-lp_noise_appendix.pdf}
\caption{\textbf{Species-specific data underlying Fig.\,\ref{fig:thresh-lp_species}ce.}
Average value $\muf$ of each of the three features
$f_i(t)$ over scale $\sca$ in the noisy case. Solid lines
and shaded areas indicate mean $\pm$ standard deviation
across songs per recording. Dashed lines indicate mean
across recordings (shown in
Fig.\,\ref{fig:thresh-lp_species}ce).}
\label{fig:app_thresh-lp_noise}
\end{figure}% Referenced.
\FloatBarrier
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_thresh_lp_appendix.pdf}
\caption{\textbf{Relation between threshold value and pure-noise feature
value for Fig.\,\ref{fig:thresh-lp_single} and
Fig.\,\ref{fig:thresh-lp_species}.}
Proportion of pure-noise kernel response $c_i(t)$ that
exceeds threshold value $\thr$ --- which determines the
average value $\muf$ of feature $f_i(t)$ --- over $\thr$
in multiples of standard deviation $\sigma_{c_i}$.
Corresponds to a ``reverse'' cumulative distribution
function (rCDF) of $c_i(t)$. Black solid lines indicate rCDF per
kernel $k_i(t)$. Red dashed line indicates rCDF for a
normal distribution with $\mu=0$ and $\sigma=1$.
}
\label{fig:app_thresh-lp_kern-sd}
\end{figure}% Referenced.
\FloatBarrier
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_full_appendix.pdf}
\caption{\textbf{Relation between threshold value and pure-noise feature
value for Fig.\,\ref{fig:pipeline_full}.}
Proportion of pure-noise kernel response $c_i(t)$ that
exceeds threshold value $\thr$ --- which determines the
average value $\muf$ of feature $f_i(t)$ --- over $\thr$
in multiples of standard deviation $\sigma_{c_i}$.
Corresponds to a ``reverse'' cumulative distribution
function (rCDF) of $c_i(t)$. Black solid lines indicate rCDF per
kernel $k_i(t)$. Red dashed line indicates rCDF for a
normal distribution with $\mu=0$ and $\sigma=1$.
}
\label{fig:app_full_kern-sd}
\end{figure}% Referenced.
\FloatBarrier
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_short_appendix.pdf}
\caption{\textbf{Relation between threshold value and pure-noise feature
value for Fig.\,\ref{fig:pipeline_short}.}
Proportion of pure-noise kernel response $c_i(t)$ that
exceeds threshold value $\thr$ --- which determines the
average value $\muf$ of feature $f_i(t)$ --- over $\thr$
in multiples of standard deviation $\sigma_{c_i}$.
Corresponds to a ``reverse'' cumulative distribution
function (rCDF) of $c_i(t)$. Black solid lines indicate rCDF per
kernel $k_i(t)$. Red dashed line indicates rCDF for a
normal distribution with $\mu=0$ and $\sigma=1$.
}
\label{fig:app_short_kern-sd}
\end{figure}% Referenced.
\FloatBarrier
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_field_appendix.pdf}
\caption{\textbf{Relation between threshold value and pure-noise feature
value for Fig.\,\ref{fig:pipeline_field}.}
Proportion of pure-noise kernel response $c_i(t)$ that
exceeds threshold value $\thr$ --- which determines the
average value $\muf$ of feature $f_i(t)$ --- over $\thr$
in multiples of standard deviation $\sigma_{c_i}$.
Corresponds to a ``reverse'' cumulative distribution
function (rCDF) of $c_i(t)$. Black solid lines indicate rCDF per
kernel $k_i(t)$. Red dashed line indicates rCDF for a
normal distribution with $\mu=0$ and $\sigma=1$.
}
\label{fig:app_field_kern-sd}
\end{figure}% Referenced.
\FloatBarrier
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_cross_species_thresh_appendix.pdf}
\caption{\textbf{Threshold-dependent intensity invariance of
species-specific feature sets.}
Same processing as in Fig.\,\ref{fig:pipeline_full}, using
different kernel-specific threshold values $\thr$
(multiples of pure-noise standard deviation
$\sigma_{\eta_i}$ of $c_i(t)$ for $\sca=0$; see also
appendix Fig.\,\ref{fig:app_full_kern-sd}). Average value
$\muf$ of each feature $f_i(t)$ over $\sca$.
}
\label{fig:app_cross_species_thresh}
\end{figure}% Reference this one!
\FloatBarrier
\end{document}