\documentclass[a4paper, 12pt]{article}

\usepackage[left=2cm,right=2cm,top=2cm,bottom=2cm,includeheadfoot]{geometry}
\usepackage[onehalfspacing]{setspace}
\usepackage{graphicx}
\usepackage{svg}
\usepackage{import}
\usepackage{float}
\usepackage{placeins}
\usepackage{parskip}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{subcaption}
\usepackage[labelfont=bf, textfont=small]{caption}
\usepackage[german,english]{babel}
\addto\captionsenglish{\renewcommand{\figurename}{Fig.}}
\addto\captionsenglish{\renewcommand{\tablename}{Tab.}}
\usepackage[separate-uncertainty=true, locale=DE]{siunitx}
\sisetup{output-exponent-marker=\ensuremath{\mathrm{e}}}
% \usepackage[capitalize]{cleveref}
% \crefname{figure}{Fig.}{Figs.}
% \crefname{equation}{Eq.}{Eqs.}
% \creflabelformat{equation}{#2#1#3}
\usepackage[
backend=biber,
style=authoryear,
pluralothers=true,
maxcitenames=1,
mincitenames=1
]{biblatex}
\addbibresource{cite.bib}
%\bibdata
%\bibstyle
%\citation

\title{Emergent intensity invariance in a physiologically inspired model of the grasshopper auditory system}
\author{Jona Hartling, Jan Benda}
\date{}

\begin{document}
\maketitle{}

% Text references and citations:
\newcommand{\bcite}[1]{\mbox{\cite{#1}}}
% \newcommand{\fref}[1]{\mbox{\cref{#1}}}
% \newcommand{\fref}[1]{\mbox{Fig.\,\ref{#1}}}
% \newcommand{\eref}[1]{\mbox{\cref{#1}}}
% \newcommand{\eref}[1]{\mbox{Eq.\,\ref{#1}}}

% Subplot lettering:
\newcommand{\figa}{\textbf{a}}
\newcommand{\figb}{\textbf{b}}
\newcommand{\figc}{\textbf{c}}
\newcommand{\figd}{\textbf{d}}
\newcommand{\fige}{\textbf{e}}

% Math shorthands - Standard symbols:
\newcommand{\dec}{\log_{10}} % Logarithm base 10
\newcommand{\infint}{\int_{-\infty}^{+\infty}} % Integral over the entire real line

% Math shorthands - Spectral filtering:
\newcommand{\bp}{h_{\text{BP}}(t)} % Bandpass filter function
\newcommand{\lp}{h_{\text{LP}}(t)} % Lowpass filter function
\newcommand{\hp}{h_{\text{HP}}(t)} % Highpass filter function
\newcommand{\fc}{f_{\text{cut}}} % Filter cutoff frequency
\newcommand{\tlp}{T_{\text{LP}}} % Lowpass filter averaging interval
\newcommand{\thp}{T_{\text{HP}}} % Highpass filter adaptation interval

% Math shorthands - Early representations:
\newcommand{\raw}{x} % Placeholder input signal
\newcommand{\filt}{\raw_{\text{filt}}} % Bandpass-filtered signal
\newcommand{\env}{\raw_{\text{env}}} % Signal envelope
\newcommand{\db}{\raw_{\text{dB}}} % Logarithmically scaled signal
\newcommand{\dbref}{\raw_{\text{ref}}} % Decibel reference intensity
\newcommand{\adapt}{\raw_{\text{adapt}}} % Adapted signal

% Math shorthands - Kernel parameters:
\newcommand{\kw}{\sigma} % Unspecific Gabor kernel width
\newcommand{\kf}{\omega} % Unspecific Gabor kernel frequency
\newcommand{\kp}{\phi} % Unspecific Gabor kernel phase
\newcommand{\kn}{n} % Unspecific Gabor kernel lobe number
% \newcommand{\ks}{s} % Unspecific Gabor kernel sign
\newcommand{\kwi}{\kw_i} % Specific Gabor kernel width
\newcommand{\kfi}{\kf_i} % Specific Gabor kernel frequency
\newcommand{\kpi}{\kp_i} % Specific Gabor kernel phase
\newcommand{\kni}{\kn_i} % Specific Gabor kernel lobe number
% \newcommand{\ksi}{\ks_i} % Specific Gabor kernel sign

% Math shorthands - Auxiliary kernel parameters:
\newcommand{\fsin}{f_{\text{sin}}} % Carrier frequency
\newcommand{\rh}{h_{\text{rel}}} % Relative Gaussian height for FWRH
\newcommand{\fwrh}{\text{FWRH}} % Gaussian full-width at relative height
\newcommand{\off}{\beta_0} % Offset for linear frequency approximation

% Math shorthands - Thresholding nonlinearity:
\newcommand{\thr}{\Theta_i} % Step function threshold value
\newcommand{\nl}{H(c_i\,-\,\thr)} % Shifted Heaviside step function

% Math shorthands - Intensity invariance analysis:
\newcommand{\soc}{s} % Song component of synthetic mixture
\newcommand{\noc}{\eta} % Noise component of synthetic mixture
\newcommand{\sca}{\alpha} % Multiplicative scale of song component
\newcommand{\xvar}{\sigma_{x}^{2}} % Variance of synthetic mixture
\newcommand{\svar}{\sigma_{\text{s}}^{2}} % Song component variance
\newcommand{\nvar}{\sigma_{\eta}^{2}} % Noise component variance
\newcommand{\pc}{p(c_i,\,T)} % Probability density (general interval)
\newcommand{\pclp}{p(c_i,\,\tlp)} % Probability density (lowpass interval)

\section{Exploring a grasshopper's sensory world}

% Why functional models of sensory systems?
Our scientific understanding of sensory processing systems results from the
distributed accumulation of anatomical, physiological, and ethological
evidence. There is no real alternative to this process; however, it leaves us
with the challenge of integrating the available fragments into a coherent
whole in order to address issues such as the interaction between individual
system components, the functional limitations of the system overall, or
taxonomic comparisons between systems that process the same sensory modality.
Any unified framework that captures the essential functional aspects of a
given sensory system thus has the potential to deepen our current
understanding and facilitate systematic investigations. However, building such
a framework is a challenging task. It requires a wealth of existing knowledge
of the system and the signals it operates on, a clearly defined scope, and
careful reduction, abstraction, and formalization of the underlying structures
and mechanisms.

% Why the grasshopper auditory system?
% Why focus on song recognition among other auditory functions?
One sensory system about which extensive information has been gathered over the
years is the auditory system of grasshoppers~(\textit{Acrididae}). Grasshoppers
rely on their sense of hearing primarily for intraspecific communication, which
includes mate attraction~(\bcite{helversen1972gesang}) and
evaluation~(\bcite{stange2012grasshopper}), sender
localization~(\bcite{helversen1988interaural}), courtship
display~(\bcite{elsner1968neuromuskularen}), rival
deterrence~(\bcite{greenfield1993acoustic}), and loss-of-signal predator
alarm~(SOURCE). In accordance with this rich behavioral repertoire,
grasshoppers have evolved a variety of sound production mechanisms to generate
acoustic communication signals for different contexts and ranges using their
wings, hindlegs, or mandibles~(\bcite{otte1970comparative}). Among the most
conspicuous acoustic signals of grasshoppers are their species-specific calling
songs, which broadcast the presence of the singing individual --- mostly the
males of the species --- to potential mates within range. These songs are
usually more characteristic of a species than morphological
traits~(\bcite{tishechkin2016acoustic}; \bcite{tarasova2021eurasius}), which
can vary greatly within species~(\bcite{rowell1972variable};
\bcite{kohler2017morphological}). The reliance on songs to mediate reproduction
represents a strong evolutionary driving force that has resulted in massive
species diversification~(\bcite{vedenina2011speciation};
\bcite{sevastianov2023evolution}), with over 6800 recognized grasshopper
species in the \textit{Acrididae} family~(\bcite{cigliano2024orthoptera}). It
is this diversity of species, and the crucial role of acoustic communication in
its emergence, that makes the grasshopper auditory system an intriguing
candidate for constructing a functional model framework. As a necessary
reduction, the model we propose here focuses on the pathway responsible for
the recognition of species-specific calling songs, disregarding other
essential auditory functions such as directional
hearing~(\bcite{helversen1984parallel}; \bcite{ronacher1986routes};
\bcite{helversen1988interaural}).

% What are the signals the auditory system is supposed to recognize?
To understand the functional challenges faced by the grasshopper auditory
system, one has to understand the properties of the songs it is designed to
recognize. Grasshopper songs are amplitude-modulated broad-band acoustic
signals. Most songs are produced by stridulation, during which the animal pulls
the serrated stridulatory file on its hindlegs across a resonating vein on the
forewings~(\bcite{helversen1977stridulatory}; \bcite{stumpner1994song};
\bcite{helversen1997recognition}). Every tooth that strikes the vein generates
a brief pulse of sound. Multiple pulses make up a syllable, and the alternation
of syllables and relatively quiet pauses forms a characteristic, though noisy,
waveform pattern. Song recognition depends on certain temporal and structural
parameters of this pattern, such as the duration of syllables and
pauses~(\bcite{helversen1972gesang}), the slope of pulse
onsets~(\bcite{helversen1993absolute}), and the accentuation of syllable onsets
relative to the preceding pause~(\bcite{balakrishnan2001song};
\bcite{helversen2004acoustic}). The amplitude modulation of the song is
sufficient for recognition~(\bcite{helversen1997recognition}).

% Why is intensity invariance important for song recognition?
However, the essential recognition cues can vary considerably with external
physical factors, which requires the auditory system to be invariant to such
variations in order to reliably recognize songs under different conditions.
For instance, the temporal structure of grasshopper songs warps with
temperature~(\bcite{skovmand1983song}). The auditory system can compensate for
this variability by reading out relative temporal relationships rather than
absolute time intervals~(\bcite{creutzig2009timescale};
\bcite{creutzig2010timescale}), as those remain relatively constant across
different temperatures~(\bcite{helversen1972gesang}). Another, perhaps even
more fundamental external source of song variability lies in the attenuation of
sound intensity with increasing distance to the sender. Sound attenuation
depends on both the frequency content of the signal and the vegetation of the
habitat~(\bcite{michelsen1978sound}). For the receiving auditory system, this
has two major implications. First, the amplitude dynamics of the song pattern
are steadily degraded over distance, which limits the effective communication
range of grasshoppers to~\mbox{1\,-\,2\,m} in their typical grassland
habitats~(\bcite{lang2000acoustic}). Second, the overall intensity level of
songs at the receiver's position varies depending on the location of the
sender, which should ideally not affect the recognition of the song pattern.
This necessitates that the auditory system achieves a certain degree of
intensity invariance --- a time scale-selective sensitivity to faster amplitude
dynamics and simultaneous insensitivity to slower, more sustained amplitude
dynamics.

Intensity invariance in different auditory systems is often associated with
neuronal adaptation~(\bcite{benda2008spike}; \bcite{barbour2011intensity};
\bcite{ozeri2018fast}; more general:~\bcite{benda2021neural}). In the
grasshopper auditory system, a number of neuron types along the processing
chain exhibit spike-frequency adaptation in response to sustained stimulus
intensities~(\bcite{romer1976informationsverarbeitung};
\bcite{gollisch2004input}; \bcite{hildebrandt2009origin};
\bcite{clemens2010intensity}; \bcite{fisch2012channel}) and thus likely
contribute to the emergence of intensity-invariant song representations. This
means that intensity invariance is not the result of a single processing step
but rather a gradual process, in which different neuronal populations
contribute to varying degrees~(\bcite{clemens2010intensity}) and by different
mechanisms~(\bcite{hildebrandt2009origin}). Approximating this process within a
functional model framework thus requires a considerable amount of
simplification. In this work, we demonstrate that even a small number of basic
physiologically inspired signal transformations --- specifically, pairs of
nonlinear and linear operations --- is sufficient to achieve a meaningful
degree of intensity invariance.

% How can song recognition be modelled functionally (feat. Jan Clemens & Co.)?
% How did we expand on the previous framework?
Invariance to non-informative song variations is crucial for reliable song
recognition; however, it is not sufficient to this end. In order to recognize a
conspecific song as such, the auditory system needs to extract sufficiently
informative features of the song pattern and then integrate the gathered
information into a final categorical percept. Previous authors have proposed a
functional model framework that describes this process --- feature extraction,
evidence accumulation, and categorical decision making --- in both
crickets~(\bcite{clemens2013computational}; \bcite{hennig2014time}) and
grasshoppers~(\bcite{clemens2013feature}; review on
both:~\bcite{ronacher2015computational}). Their framework provides a
comprehensible and biologically plausible account of the computational
mechanisms required for species-specific song recognition, which has served as
the inspiration for the development of the model pathway we propose here.

The existing framework relies on pulse trains as input signals, which were
designed to capture the essential structural properties of natural song
envelopes~(\bcite{clemens2013feature}). In the first step, a bank of parallel
linear-nonlinear feature detectors is applied to the input signal. Each feature
detector consists of a convolutional filter and a subsequent sigmoidal
nonlinearity. The outputs of these feature detectors are temporally averaged to
obtain a single feature value per detector, which is then assigned a specific
weight. The linear combination of weighted feature values results in a single
preference value that serves as a predictor for the behavioral response of the
animal to the presented input signal.

Our model pathway adopts the general structure of the existing framework but
modifies it in several key aspects. The convolutional filters, which have
previously been fitted to behavioral data for each individual
species~(\bcite{clemens2013computational}), are replaced by a larger, generic
set of unfitted Gabor basis functions in order to cover a wide range of
possible song features across different species. Gabor functions approximate
the general structure of the filters used in the existing framework as well as
the filter functions found in various auditory
neurons~(\bcite{rokem2006spike}; \bcite{clemens2011efficient};
\bcite{clemens2012nonlinear}). The fitted sigmoidal nonlinearities in the
existing framework consistently exhibited very steep slopes and are therefore
replaced by shifted Heaviside step functions, which results in a binarization
of the feature detector outputs. Another, more substantial modification is that
the feature detector outputs are temporally averaged in a way that does not
condense them into single feature values but retains their time-varying
structure. This is in line with the fact that songs are not discrete units but
part of a continuous acoustic stream that the auditory system has to process in
real time. Moreover, a time-varying feature representation only stabilizes
after a certain delay following the onset of a song, which emphasizes the
temporal dynamics of evidence accumulation towards a final categorical
decision. The most notable difference between our model pathway and the
existing framework, however, lies in the addition of a physiologically inspired
preprocessing stage, whose starting point corresponds to the initial reception
of airborne sound waves. This allows the model to operate on unmodified
recordings of natural grasshopper songs instead of condensed pulse train
approximations, which widens its scope towards more realistic, ecologically
relevant scenarios. For instance, we were able to investigate the contribution
of different processing stages to the emergence of intensity-invariant song
representations based on actual field recordings of songs at different
distances from the sender.

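As a concrete illustration, the linear-nonlinear readout of the existing
framework described above can be written down in a few lines of Python. This is
a minimal sketch under stated assumptions: the filter shapes, the sigmoid's
slope and threshold, and the feature weights below are placeholders, whereas in
the cited studies these quantities were fitted to behavioral data.

```python
import numpy as np

def preference(x, filters, weights, slope=50.0, threshold=0.5):
    """Linear-nonlinear feature detectors followed by temporal averaging
    and a weighted linear combination into a single preference value.

    slope, threshold, and weights are placeholder values, not fitted ones.
    """
    values = []
    for h in filters:
        c = np.convolve(x, h, mode="same")            # convolutional filter
        z = np.clip(slope * (c - threshold), -500.0, 500.0)
        s = 1.0 / (1.0 + np.exp(-z))                  # steep sigmoid nonlinearity
        values.append(s.mean())                       # one feature value per detector
    return float(np.dot(weights, values))             # weighted sum: preference value
```

The returned scalar plays the role of the behavioral predictor in the cited
framework; our model pathway replaces the sigmoid by a step function and drops
the condensing temporal average, as described above.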
In the following, we outline the structure of the proposed model of the
grasshopper auditory pathway, from the initial reception of sound waves up to
the generation of a high-dimensional, time-varying feature representation that
is suitable for species-specific song recognition. We provide a side-by-side
account of the known physiological processing steps and their functional
approximation by basic mathematical operations. We then elaborate on two key
mechanisms that drive the emergence of intensity-invariant song representations
within the auditory pathway.

% SCRAPPED UNTIL FURTHER NOTICE:
% Multi-species, multi-individual communally inhabited environments\\
% - Temporal overlap: Simultaneous singing across individuals/species common\\
% - Frequency overlap: Little speciation into frequency bands (likely unused)\\
% - "Biotic noise": Hetero-/conspecifics ("Another one's songs are my noise")\\
% - "Abiotic noise": Wind, water, vegetation, anthropogenic\\
% - Effects of habitat structure on sound propagation (landscape - soundscape)\\
% $\rightarrow$ Sensory constraints imposed by the (acoustic) environment

% Cluster of auditory challenges (interlocking constraints $\rightarrow$ tight coupling):\\
% From continuous acoustic input, generate neuronal representations that...\\
% 1)...allow for the separation of relevant (song) events from ambient noise floor\\
% 2)...compensate for behaviorally non-informative song variability (invariances)\\
% 3)...carry sufficient information to characterize different song patterns,
% recognize the ones produced by conspecifics, and make appropriate behavioral
% decisions based on context (sender identity, song type, mate/rival quality)

% How can the auditory system of grasshoppers meet these challenges?\\
% - What are the minimum functional processing steps required?\\
% - Which known neuronal mechanisms can implement these steps?\\
% - Which and how many stages along the auditory pathway contribute?\\
% $\rightarrow$ What are the limitations of the system as a whole?

% How can a human observer conceive a grasshopper's auditory percepts?\\
% - How to investigate the workings of the auditory pathway as a whole?\\
% - How to systematically test effects and interactions of processing parameters?\\
% - How to integrate the available knowledge on anatomy, physiology, ethology?\\
% $\rightarrow$ Abstract, simplify, formalize $\rightarrow$ Functional model framework

\section{Developing a functional model of the\\grasshopper song recognition pathway}

The essence of constructing a functional model of a given system is to gain a
sufficient understanding of the system's essential structural components and
their presumed functional roles; and to then build a formal framework of
manageable complexity around these two aspects. Anatomically, the organization
of the grasshopper song recognition pathway can be outlined as a feed-forward
network of three consecutive neuronal
populations~(Fig.\,\mbox{\ref{fig:pathway}a-c}): peripheral auditory receptor
neurons, whose axons enter the ventral nerve cord at the level of the
metathoracic ganglion; local interneurons that remain exclusively within the
thoracic region of the ventral nerve cord; and ascending neurons projecting
from the thoracic region towards the supraesophageal
ganglion~(\bcite{rehbein1974structure}; \bcite{rehbein1976auditory};
\bcite{eichendorf1980projections}). The input to the network originates at the
tympanal membrane, which acts as acoustic receiver and is coupled to the
dendritic endings of the receptor neurons~(\bcite{gray1960fine}). The outputs
from the network converge in the supraesophageal ganglion, which is presumed to
harbor the neuronal substrate for conspecific song recognition and response
initiation~(\bcite{ronacher1986routes}; \bcite{bauer1987separate};
\bcite{bhavsar2017brain}). Functionally, the ascending neurons are the most
diverse of the three populations along the pathway. Individual ascending
neurons possess highly specific response properties that contrast with the
rather homogeneous response properties of the preceding receptor neurons and
local interneurons~(\bcite{clemens2011efficient}), indicating a transition from
a uniform population-wide processing stream into several parallel branches.
Based on these anatomical and physiological considerations, the overall
structure of the model pathway is divided into two distinct
stages~(Fig.\,\ref{fig:pathway}d). The preprocessing stage incorporates the
known physiological processing steps at the levels of the tympanal membrane,
the receptor neurons, and the local interneurons; and operates on
one-dimensional signal representations. The feature extraction stage
corresponds to the processing within the ascending neurons and further
downstream towards the supraesophageal ganglion; and operates on
high-dimensional signal representations. The details of each physiological
processing step and its functional approximation within the two stages are
outlined in the following sections.

\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_auditory_pathway.pdf}
\caption{\textbf{Schematic organization of the song recognition pathway in
grasshoppers compared to the structure of the functional
model pathway.}
\textbf{a}:~Simplified course of the pathway in the
grasshopper, from the tympanal membrane via receptor
neurons, local interneurons, and ascending neurons further
towards the supraesophageal ganglion.
\textbf{b}:~Schematic of synaptic connections between
the three neuronal populations within the metathoracic
ganglion.
\textbf{c}:~Network representation of neuronal connectivity.
\textbf{d}:~Flow diagram of the different signal
representations and transformations along the model
pathway. All representations are time-varying. 1st half:
Preprocessing stage (one-dimensional). 2nd half: Feature
extraction stage (high-dimensional).
}
\label{fig:pathway}
\end{figure}

\subsection{Population-driven signal preprocessing}

Grasshoppers receive airborne sound waves via a tympanal organ on either side
of the body. The tympanal membrane acts as a mechanical resonance filter for
sound-induced vibrations~(\bcite{windmill2008time}; \bcite{malkin2014energy}).
Vibrations that fall within specific frequency bands are focused on different
membrane areas, while others are attenuated. This processing step can be
approximated by an initial bandpass filter
\begin{equation}
\filt(t)\,=\,\raw(t)\,*\,\bp, \qquad \fc\,=\,5\,\text{kHz},\,30\,\text{kHz}
\label{eq:bandpass}
\end{equation}
applied to the acoustic input signal $\raw(t)$. The auditory receptor neurons
transduce the vibrations of the tympanal membrane into sequences of action
potentials. Thereby, they encode the amplitude modulation, or envelope, of the
signal~(\bcite{machens2001discrimination}), which likely involves a rectifying
nonlinearity~(\bcite{machens2001representation}). This can be modelled as
full-wave rectification followed by lowpass filtering
\begin{equation}
\env(t)\,=\,|\filt(t)|\,*\,\lp, \qquad \fc\,=\,500\,\text{Hz}
\label{eq:env}
\end{equation}
of the tympanal signal $\filt(t)$. Furthermore, the receptors exhibit a
sigmoidal response curve over logarithmically compressed intensity
levels~(\bcite{suga1960peripheral}; \bcite{gollisch2002energy}). In the model
pathway, logarithmic compression is achieved by conversion to decibel scale
\begin{equation}
\db(t)\,=\,10\,\cdot\,\dec \frac{\env(t)}{\dbref}, \qquad \dbref\,=\,\max\big[\env(t)\big]
\label{eq:log}
\end{equation}
relative to the maximum intensity $\dbref$ of the signal envelope $\env(t)$.
Both the receptor neurons~(\bcite{romer1976informationsverarbeitung};
\bcite{gollisch2004input}; \bcite{fisch2012channel}) and, on a larger scale,
the subsequent local interneurons~(\bcite{hildebrandt2009origin};
\bcite{clemens2010intensity}) adapt their firing rates in response to sustained
stimulus intensity levels, which allows for the robust encoding of faster
amplitude modulations against a slowly changing overall baseline intensity.
Functionally, the adaptation mechanism resembles a highpass filter
\begin{equation}
\adapt(t)\,=\,\db(t)\,*\,\hp, \qquad \fc\,=\,10\,\text{Hz}
\label{eq:highpass}
\end{equation}
over the logarithmically scaled envelope $\db(t)$. This processing step
concludes the preprocessing stage of the model pathway. The resulting
intensity-adapted envelope $\adapt(t)$ is then passed on from the local
interneurons to the ascending neurons, where it serves as the basis for the
following feature extraction stage.

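The four preprocessing steps above can be sketched in Python with SciPy. This
is a minimal illustration only: the text specifies the cutoff frequencies, but
the filter family (Butterworth), the filter orders, the zero-phase filtering,
and the sampling rate in the example are our assumptions.

```python
import numpy as np
from scipy import signal

def preprocess(x, fs):
    """Preprocessing stage of the model pathway, sketched with
    Butterworth filters (an assumption; only cutoffs are given)."""
    # Bandpass 5-30 kHz (tympanal membrane as resonance filter)
    sos_bp = signal.butter(2, [5e3, 30e3], btype="bandpass", fs=fs, output="sos")
    x_filt = signal.sosfiltfilt(sos_bp, x)
    # Full-wave rectification + 500 Hz lowpass (envelope extraction)
    sos_lp = signal.butter(2, 500.0, btype="lowpass", fs=fs, output="sos")
    x_env = signal.sosfiltfilt(sos_lp, np.abs(x_filt))
    # Decibel scale relative to the envelope maximum (log compression)
    x_env = np.maximum(x_env, 1e-12)          # guard against log(0)
    x_db = 10.0 * np.log10(x_env / x_env.max())
    # 10 Hz highpass (spike-frequency adaptation)
    sos_hp = signal.butter(1, 10.0, btype="highpass", fs=fs, output="sos")
    return signal.sosfiltfilt(sos_hp, x_db)
```

Applied to a recorded waveform sampled at, e.g., 96\,kHz, this returns the
intensity-adapted envelope that the feature extraction stage operates on.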
% Cite somewhere:
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_pre_stages.pdf}
\caption{\textbf{Representations of a song of \textit{O. rufipes} during
the preprocessing stage.}
\textbf{a}:~Bandpass-filtered tympanal signal.
\textbf{b}:~Signal envelope.
\textbf{c}:~Logarithmically scaled envelope.
\textbf{d}:~Intensity-adapted envelope.
}
\label{fig:pre}
\end{figure}
\FloatBarrier

\subsection{Feature extraction by individual neurons}

The ascending neurons extract and encode a number of different features of the
preprocessed signal. As a population, they hence represent the signal in a
higher-dimensional space than the preceding receptor neurons and local
interneurons. Each ascending neuron is assumed to scan the signal for a
specific template pattern, which can be thought of as a kernel of a particular
structure and on a particular time scale. This process, known as template
matching, can be modelled as a convolution
\begin{equation}
c_i(t)\,=\,\adapt(t)\,*\,k_i(t)
= \infint \adapt(\tau)\,\cdot\,k_i(t\,-\,\tau)\,d\tau
\label{eq:conv}
\end{equation}
of the intensity-adapted envelope $\adapt(t)$ with a kernel $k_i(t)$ per
ascending neuron. We use Gabor kernels as basis functions for creating
different template patterns. An arbitrary one-dimensional, real Gabor kernel is
generated by multiplication of a Gaussian envelope and a sinusoidal carrier
\begin{equation}
k_i(t,\,\kwi,\,\kfi,\,\kpi)\,=\,e^{-\frac{t^{2}}{2{\kwi}^{2}}}\,\cdot\,\sin(\kfi\,t\,+\,\kpi), \qquad \kfi\,=\,2\pi\fsin
\label{eq:gabor}
\end{equation}
with Gaussian standard deviation or kernel width $\kwi$, carrier frequency
$\kfi$, and carrier phase $\kpi$. Different combinations of $\kw$ and $\kf$
result in Gabor kernels with different lobe numbers $\kn$, which is the number
of half-periods of the carrier that fit under the Gaussian envelope within
reasonable limits of attenuation. The interval under the Gaussian envelope that
contains the relevant lobes of the kernel can be defined as the Gaussian
full-width measured at relative peak height $\rh$
\begin{equation}
\fwrh(\kw,\,\rh)\,=\,2\,\cdot\,\sqrt{-2\,\cdot\,\ln \rh}\,\cdot\,\kw, \qquad \rh\,\in\,(0,\,1]
\end{equation}
With this, an appropriate carrier frequency $\kf$ for obtaining a Gabor kernel
with width $\kw$ and desired lobe number $\kn$ can be approximated as
% \begin{equation}
% \kf(\kn,\,\fwrh)\,=\,\frac{0.5\,\cdot\,\kn\,+\,\off}{\fwrh}, \qquad \kn\,\geq\,2\enspace\forall\enspace \kn\,\in\,\mathbb{Z}
% \end{equation}
\begin{equation}
\kf(\kn,\,\kw,\,\rh)\,=\,\frac{\kn\,+\,\off}{4\,\cdot\,\sqrt{-2\,\cdot\,\ln \rh}\,\cdot\,\kw}, \qquad \kn\,\in\,\mathbb{Z},\enspace\kn\,\geq\,2
\end{equation}
where $\off$ is a small positive offset to the near-linear relationship between
$\kf$ and $\kn$ to balance the amplitude of the $\kn$ desired lobes of the
kernel --- which should be maximized --- against the amplitude of the
next-outer lobes, which should not exceed the threshold value determined by
$\rh$. For $\kn=1$, carrier frequency $\kf$ is set to zero, which results in a
simple Gaussian kernel. Carrier phase $\kp$ determines the position of the
kernel lobes relative to the kernel center. By setting $\kp$ to one of only
four specific phase values~(Tab.\,\ref{tab:gabor_phases}), we restrict the
Gabor kernels to be either even functions~(mirror-symmetric, odd $\kn$) or
odd functions~(point-symmetric, even $\kn$) with either positive or negative
sign, which refers to the sign of the kernel's central lobe (even kernels) or
the left of the two central lobes (odd kernels).
\FloatBarrier

\begin{table}[!ht]
\centering
\captionsetup{width=.46\textwidth}
\caption{Values of phase $\kp$ that are specific for the four major groups
of Gabor kernels.}
\begin{tabular}{|ccc|}
\hline
sign & even kernels & odd kernels\\
\hline
$+$ & $+\pi\,/\,2$ & $\pi$\\
$-$ & $-\pi\,/\,2$ & $0$\\
\hline
\end{tabular}
\label{tab:gabor_phases}
\end{table}
\FloatBarrier

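The kernel construction above, together with the four phase values from the
table, can be sketched as follows. The relative height and the offset are
assumed example values (the text leaves their numerical choice open), and the
$\pm3\kw$ kernel support is our choice.

```python
import numpy as np

# Phase values keyed by (symmetry, sign), as listed in the table above
PHASES = {("even", +1): np.pi / 2, ("even", -1): -np.pi / 2,
          ("odd", +1): np.pi, ("odd", -1): 0.0}

def gabor_kernel(sigma, n, sign, dt, h_rel=0.5, beta0=0.25):
    """Gabor kernel of width sigma [s], lobe number n, and sign (+1 or -1),
    sampled at resolution dt. h_rel and beta0 are assumed example values."""
    half = int(round(3 * sigma / dt))             # support: +-3 sigma
    t = np.arange(-half, half + 1) * dt
    if n == 1:
        omega = 0.0                               # plain Gaussian kernel
        phi = PHASES[("even", sign)]
    else:
        # carrier frequency approximation, with the kernel width sigma
        # restored in the denominator (missing in the draft equation)
        omega = (n + beta0) / (4 * np.sqrt(-2 * np.log(h_rel)) * sigma)
        phi = PHASES[("even" if n % 2 else "odd", sign)]
    return np.exp(-t**2 / (2 * sigma**2)) * np.sin(omega * t + phi)
```

Odd lobe numbers yield mirror-symmetric (even) kernels and even lobe numbers
yield point-symmetric (odd) kernels, as described in the text.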
These four major groups of Gabor kernels allow for the extraction of different
|
|
types of signal features, such as the presence of peaks (even, $+$), troughs
|
|
(even, $-$), onsets (odd, $+$), and offsets (odd, $-$) at various time scales.
|
|
Following the convolutional template matching, each kernel-specific response
|
|
$c_i(t)$ is passed through a shifted Heaviside step-function $\nl$ with
|
|
threshold value $\thr$ to obtain a binary response
|
|
\begin{equation}
|
|
b_i(t,\,\thr)\,=\,\begin{cases}
|
|
\;1, \quad c_i(t)\,>\,\thr\\
|
|
\;0, \quad c_i(t)\,\leq\,\thr
|
|
\end{cases}
|
|
\label{eq:binary}
|
|
\end{equation}
|
|
which can be thought of as a categorization into "relevant" and "irrelevant"
|
|
response values. In the grasshopper, these thresholding nonlinearities might
|
|
either be part of the processing within the ascending neurons or take place
|
|
further downstream~(SOURCE). Finally, the responses of the ascending neurons
|
|
are assumed to be integrated somewhere in the supraesophageal
|
|
ganglion~(\bcite{ronacher1986routes}; \bcite{bauer1987separate};
|
|
\bcite{bhavsar2017brain}). This processing step can be approximated as temporal
|
|
averaging of the binary responses $b_i(t)$ by a lowpass filter
|
|
\begin{equation}
|
|
f_i(t)\,=\,b_i(t)\,*\,\lp, \qquad \fc\,=\,1\,\text{Hz}
|
|
\label{eq:lowpass}
|
|
\end{equation}
|
|
to obtain a final set of slowly changing kernel-specific features $f_i(t)$. In
|
|
the resulting high-dimensional feature space, different species-specific song
|
|
patterns are characterized by a distinct combination of feature values, which
|
|
can be read out by a simple linear classifier.
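This thresholding-and-averaging stage can be sketched numerically. The snippet below uses a toy pulse-train envelope and illustrative kernel parameters, not the model's actual values:

```python
import numpy as np

rng = np.random.default_rng(0)
dt = 0.001                                   # 1 ms resolution
t = np.arange(0.0, 2.0, dt)

# Toy stand-in for the preprocessed envelope: a noisy periodic pulse train.
env = (np.sin(2 * np.pi * 10 * t) > 0.6).astype(float) \
    + 0.05 * rng.standard_normal(t.size)

# One even, positive Gabor kernel (illustrative parameterization).
tk = np.arange(-0.016, 0.016, dt)
kernel = np.exp(-tk**2 / (2 * 0.004**2)) * np.cos(2 * np.pi * 60 * tk)
kernel -= kernel.mean()                      # zero-mean template

c = np.convolve(env, kernel, mode="same")    # kernel response c_i(t)
theta = 0.5 * c.std()                        # example threshold value
b = (c > theta).astype(float)                # binary response b_i(t)

# First-order lowpass with f_c = 1 Hz, implemented as a leaky integrator.
tau = 1.0 / (2 * np.pi * 1.0)
a = dt / tau
f = np.zeros_like(b)
for i in range(1, b.size):
    f[i] = f[i - 1] + a * (b[i] - f[i - 1])  # slowly changing feature f_i(t)
```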

% Cite somewhere:
\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_feat_stages.pdf}
\caption{\textbf{Representations of a song of \textit{O. rufipes} during
the feature extraction stage.}
Different colors indicate Gabor kernels with different
lobe number $\kn$ and sign, with lighter colors for higher
$\kn$~($1\,\leq\,\kn\,\leq\,4$; both $+$ and $-$ per $\kn$;
two kernel widths $\kw$ of $4\,$ms and $32\,$ms per sign).
\textbf{a}:~Kernel-specific filter responses.
\textbf{b}:~Binary responses.
\textbf{c}:~Finalized features.
}
\label{fig:stages_feat}
\end{figure}
\FloatBarrier

\section{Two mechanisms driving the emergence of intensity-invariant song representation}

% Still missing the SNR analysis. Should be able to write around it for now.
The robustness of song recognition is tied to the degree of intensity
invariance of the finalized feature representation. Ideally, the values of each
feature should depend only on the relative amplitude dynamics of the song
pattern but not on the overall intensity of the song. In the grasshopper, the
emergence of intensity-invariant representations along the song recognition
pathway is likely a distributed process that involves different neuronal
populations, which raises the question of what the essential computational
mechanisms are that drive this process. Within the model pathway, we identified
two key mechanisms that render the song representation more invariant to
intensity variations. The two mechanisms each comprise a nonlinear signal
transformation followed by a linear signal transformation but differ in the
specific operations involved, as outlined in the following sections.

\subsection{Logarithmic compression \& spike-frequency adaptation}

The first notable emergence of intensity invariance along the model pathway
occurs during the transformation of the signal envelope $\env(t)$ into the
logarithmically scaled envelope $\db(t)$ and then into the intensity-adapted
envelope $\adapt(t)$. In order to disentangle the interplay of logarithmic
compression and adaptation, $\env(t)$ can be rewritten as a synthetic mixture
\begin{equation}
\env(t)\,=\,\sca\,\cdot\,\soc(t)\,+\,\noc(t), \qquad \env(t)\,>\,0\enspace\forall\enspace t\,\in\,\mathbb{R}
\label{eq:toy_env}
\end{equation}
of a song component $\soc(t)$ with variable multiplicative scale $\sca\geq0$
and a fixed-scale noise component $\noc(t)$. Both $\soc(t)$ and $\noc(t)$ are
assumed to have unit variance. By conversion of $\env(t)$ to decibel
scale~(Eq.\,\ref{eq:log}), $\sca$ turns from a multiplicative scale in linear
space into an additive term, or offset, in logarithmic space
\begin{equation}
\begin{split}
\db(t)\,&=\,\log \frac{\sca\,\cdot\,\soc(t)\,+\,\noc(t)}{\dbref}\\
&=\,\log \frac{\sca}{\dbref}\,+\,\log \left[\soc(t)\,+\,\frac{\noc(t)}{\sca}\right]
\end{split}
\label{eq:toy_log}
\end{equation}
which allows for its separation from $\soc(t)$ but introduces a scaling of
$\noc(t)$ by the inverse of $\sca$. The subsequent
highpass-filtering~(Eq.\,\ref{eq:highpass}) of $\db(t)$ can then be
approximated as a subtraction of the local offset within a suitable time
interval $0 \ll \thp < \frac{1}{\fc}$:
\begin{equation}
\adapt(t)\,\approx\,\db(t)\,-\,\log \frac{\sca}{\dbref}\,=\,\log\left[\soc(t)\,+\,\frac{\noc(t)}{\sca}\right]
\label{eq:toy_highpass}
\end{equation}
This means that $\sca$ cannot be entirely eliminated from $\adapt(t)$, only
redistributed between $\soc(t)$ and $\noc(t)$. In consequence, if $\sca$ is
sufficiently large ($\sca\gg1$), $\noc(t)$ is attenuated to the point of being
negligible, so that $\adapt(t)$ represents $\soc(t)$ in a scale-free manner. If
$\soc(t)$ and $\noc(t)$ are at similar scales ($\sca\approx1$), $\adapt(t)$
largely resembles $\db(t)$. However, if $\sca$ is sufficiently small
($\sca\ll1$), $\noc(t)$ masks $\soc(t)$ even after the intensity adaptation.
Therefore, the effective intensity invariance of $\adapt(t)$ relative to
$\env(t)$ is limited by the initial scaling of $\soc(t)$ relative to $\noc(t)$;
that is, the signal-to-noise ratio (SNR) of $\env(t)$ with ($\sca>0$) and
without ($\sca=0$) song component $\soc(t)$
\begin{equation}
\text{SNR}(\sca)\,=\,\frac{\xvar}{\nvar}\,=\,\frac{\sca^{2}\,\cdot\,\svar\,+\,\nvar}{\nvar}\,=\,\sca^{2}\,+\,1, \qquad \svar\,=\,\nvar\,=\,1
\label{eq:toy_snr}
\end{equation}
which depends quadratically on $\sca$ if $\soc(t)\perp\noc(t)$. Overall, the
combination of logarithmic compression and adaptation allows for the
equalization of different sufficiently large song scales, which is essential
for intensity-invariant song representation. However, this mechanism is unable
to recover songs that have already sunk below the noise floor, which
emphasizes the importance of a sufficiently high SNR at the initial reception
of the signal for reliable song recognition.

\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_log_hp.pdf}
\caption{\textbf{Intensity invariance by logarithmic compression and
adaptation is restricted by the noise floor.}
Synthetic input $\filt(t)$ consists of song component
$\soc(t)$ scaled by $\sca$ with (\figc{} and \figd) or
without (\figa{} and \figb) additive noise component
$\noc(t)$. Input $\filt(t)$ is transformed into envelope
$\env(t)$, logarithmically compressed envelope $\db(t)$,
and intensity-adapted envelope $\adapt(t)$.
\textbf{Left}:~$\env(t)$, $\db(t)$, and $\adapt(t)$ for
different scales $\sca$.
\textbf{Right}:~Ratios of the standard deviation of
$\env(t)$, $\db(t)$, and $\adapt(t)$ relative to the
respective reference standard deviation $\sigma_{\eta}$
for input $\filt(t)=\noc(t)$.
\figa{} and \figb:~Ideally, if $\filt(t)=\sca\cdot\soc(t)$, then
$\adapt(t)$ is intensity-invariant across all $\sca$.
\figc{} and \figd:~In practice, if
$\filt(t)=\sca\cdot\soc(t)+\noc(t)$, the intensity
invariance of $\adapt(t)$ is limited to sufficiently large
$\sca$. Shaded area indicates saturation of $\adapt(t)$ at
$95\,\%$ curve span.
}
\label{fig:inv_log-hp}
\end{figure}
\FloatBarrier

\subsection{Thresholding nonlinearity \& temporal averaging}

\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh_lp_single.pdf}
\caption{\textbf{Intensity invariance by thresholding and temporal
averaging depends on both the threshold value and the
noise floor.}
Synthetic input $\adapt(t)$ consists of song component
$\soc(t)$ scaled by $\sca$ with additive noise component
$\noc(t)$. Input $\adapt(t)$ is transformed into kernel
response $c_i(t)$, binary response $b_i(t)$, and feature
$f_i(t)$. Threshold value $\thr$ is set to multiples of
the reference standard deviation $\sigma_{\eta}$ of $c_i(t)$ for input
$\adapt(t)=\noc(t)$. Darker colors correspond to higher
$\thr$.
\textbf{Left}:~$\adapt(t)$, $c_i(t)$, $b_i(t)$, and
$f_i(t)$ for different scales $\sca$ and threshold values
$\thr$. Left-most column is the pure-noise reference.
\textbf{Right}:~Average value of $f_i(t)$ during the song
for the different $\thr$.
\figa:~Input $\adapt(t)$.
\figb-\figd:~$c_i(t)$, $b_i(t)$, and $f_i(t)$ for the
different $\thr$ based on the same $\adapt(t)$ from
\figa{}.
\fige:~Average value of $f_i(t)$ during the song for
the different $\thr$ in \figb{}-\figd.
}
\label{fig:inv_thresh-lp_single}
\end{figure}
\FloatBarrier

\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh_lp_species.pdf}
\caption{\textbf{Feature representation of different species-specific songs
saturates at different points in feature space.}
}
\label{fig:inv_thresh-lp_species}
\end{figure}
\FloatBarrier

\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_invariance_full.pdf}
\caption{\textbf{Step-wise emergence of intensity-invariant song
representation along the model pathway.}
}
\label{fig:inv_thresh-lp_full}
\end{figure}
\FloatBarrier

The second key mechanism for the emergence of intensity invariance along the
model pathway takes place during the transformation of the kernel responses
$c_i(t)$ over the binary responses $b_i(t)$ into the finalized features
$f_i(t)$. Kernel response $c_i(t)$ quantifies the degree of similarity between
kernel $k_i(t)$ and the preprocessed signal $\adapt(t)$. The thresholding
nonlinearity $\nl$ categorizes the value of $c_i(t)$ at every time point $t$
into ``relevant'' ($c_i(t)>\thr$, $b_i(t)=1$) and ``irrelevant''
($c_i(t)\leq\thr$, $b_i(t)=0$) response values~(Eq.\,\ref{eq:binary}). In doing
so, the probability density $\pc$ of $c_i(t)$ within some observed time
interval $T$ is split around threshold value $\thr$ into two complementary
parts:
\begin{equation}
\int_{\thr}^{+\infty} \pc\,dc_i\,=\,1\,-\,\int_{-\infty}^{\thr} \pc\,dc_i\,=\,\frac{T_1}{T}, \qquad \infint \pc\,dc_i\,=\,1
\label{eq:pdf_split}
\end{equation}
The right-sided part of the split $\pc$ corresponds to the time $T_1$ during
which $c_i(t)>\thr$, while the left-sided part corresponds to the time
$T_0=T-T_1$ during which $c_i(t)\leq\thr$. The integral over the right-sided
part of $\pc$ represents the ratio of time $T_1$ to total time $T$ because the
integral of a probability density over its entire domain is normalized to 1.
Following the thresholding nonlinearity, the resulting binary responses
$b_i(t)$ are lowpass-filtered~(Eq.\,\ref{eq:lowpass}) to obtain $f_i(t)$, which
can be approximated as temporal averaging over a suitable time interval
$\tlp>\frac{1}{\fc}$
\begin{equation}
f_i(t)\,\approx\,\frac{1}{\tlp} \int_{t}^{t\,+\,\tlp} b_i(\tau)\,d\tau\,=\,\frac{T_1}{\tlp}, \qquad b_i(t)\,\in\,\{0,\,1\}
\label{eq:feat_avg}
\end{equation}
Feature $f_i(t)$ thus approximates the ratio of the time $T_1$ during which
$b_i(t)$ is 1 to the total averaging interval $\tlp$. Therefore, the value of
$f_i(t)$ at every time point $t$ approximately signifies the cumulative
probability that $c_i(t)$ exceeds $\thr$ during the corresponding averaging
interval $\tlp$:
\begin{equation}
f_i(t)\,\approx\,\int_{\thr}^{+\infty} \pclp\,dc_i\,=\,P(c_i\,>\,\thr,\,\tlp)
\label{eq:feat_prop}
\end{equation}
In a sense, $f_i(t)$ resembles a duty cycle, as it quantifies purely temporal
relations in the structure of $c_i(t)$ with no regard for precise amplitude
values apart from their relation to $\thr$.

Accordingly, a substantial amount of information about the degree of similarity
between signal $\adapt(t)$ and kernel $k_i(t)$ that is contained in $c_i(t)$ is
lost during its transformation into $f_i(t)$. Instead, $f_i(t)$ only retains
information about the temporal relation of $c_i(t)$ to $\thr$. This
near-complete loss of amplitude information is the key to the intensity
invariance of $f_i(t)$: For a given $\thr$, different scales of $c_i(t)$ can
still result in similar $T_1$ segments, depending on the magnitude of the
derivative of $c_i(t)$ close to the time points at which $c_i(t)$ crosses
$\thr$. The steeper the slope of $c_i(t)$ around the threshold crossings, the
less $T_1$ changes with scale variations.
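The slope argument can be illustrated with two toy waveforms, a near-square wave with steep threshold crossings and a sinusoid with shallow ones (illustrative signals, not model responses):

```python
import numpy as np

t = np.linspace(0.0, 1.0, 100_000, endpoint=False)
theta = 0.2                                  # fixed threshold value

def duty_cycle(c, theta):
    """Fraction of time with c(t) > theta, i.e. T1 / T."""
    return np.mean(c > theta)

steep = np.tanh(20 * np.sin(2 * np.pi * 5 * t))  # near-square: steep crossings
shallow = np.sin(2 * np.pi * 5 * t)              # sinusoid: shallow crossings

scales = (0.5, 1.0, 2.0, 4.0)
steep_dc = [duty_cycle(a * steep, theta) for a in scales]
shallow_dc = [duty_cycle(a * shallow, theta) for a in scales]

# Steep slopes around the crossings -> T1 barely changes with scale:
assert max(steep_dc) - min(steep_dc) < 0.02
# Shallow slopes -> T1 drifts noticeably with scale:
assert max(shallow_dc) - min(shallow_dc) > 0.05
```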


\section{Discriminating species-specific song\\patterns in feature space}

\section{Conclusions \& outlook}

\textbf{Song recognition pathway: Grasshopper vs. model:}\\
The model pathway includes a rather large number of Gabor kernels compared to
the 15 to 20 ascending neurons in the grasshopper auditory
system~(\bcite{stumpner1991auditory}).


\textbf{Definition of invariance (general, systemic):}\\
Invariance = Property of a system to maintain a stable output with respect to a
set of relevant input parameters (variation to be represented) but irrespective
of one or more other parameters (variation to be discarded)\\
$\rightarrow$ Selective input-output decorrelation

\textbf{Definition of intensity invariance (context of neurons and songs):}\\
Intensity invariance = Time scale-selective sensitivity to certain faster
amplitude dynamics (song waveform, small-scale AM) and simultaneous
insensitivity to slower, more sustained amplitude dynamics (transient baseline,
large-scale AM, current overall intensity level)\\
$\rightarrow$ Without time scale selectivity, any fully intensity-invariant
output will be a flat line


\textbf{Log-HP: Implication for intensity invariance:}\\
- Logarithmic scaling is essential for equalizing different song intensities\\
$\rightarrow$ Intensity information can be manipulated more easily in the form
of a signal offset in log-space than as a multiplicative scale in linear space

- Capability to compensate for intensity variations, i.e. selective amplification
of output $\adapt(t)$ relative to input $\env(t)$, is limited by the input SNR~(Eq.\,\ref{eq:toy_snr}):\\
$\rightarrow$ Ability to equalize between different sufficiently large scales of $\soc(t)$\\
$\rightarrow$ Inability to recover $\soc(t)$ when initially masked by the noise floor $\noc(t)$

- Logarithmic scaling emphasizes small amplitudes (song onsets, noise floor)\\
$\rightarrow$ Recurring trade-off: Equalizing signal intensity vs preserving initial SNR

\textbf{Thresh-LP: Implication for intensity invariance:}\\
- Role of song periodicity for feature representation!

- Suggests a relatively simple rule for optimal choice of threshold value $\thr$:\\
$\rightarrow$ Find amplitude $c_i$ that maximizes the absolute derivative of $c_i(t)$ over time\\
$\rightarrow$ Optimal with respect to intensity invariance of $f_i(t)$, not necessarily for
other criteria such as song-noise separation or diversity between features

- Nonlinear operations can be used to detach representations from the graded physical
stimulus (to facilitate categorical behavioral decision-making?):\\
1) Capture sufficiently precise amplitude information: $\env(t)$, $\adapt(t)$\\
$\rightarrow$ Closely following the AM of the acoustic stimulus\\
2) Quantify relevant stimulus properties on a graded scale: $c_i(t)$\\
$\rightarrow$ More decorrelated representation, compared to prior stages\\
3) Nonlinearity: Distinguish between ``relevant vs irrelevant'' values: $b_i(t)$\\
$\rightarrow$ Trading a graded scale for two or more categorical states\\
4) Represent stimulus properties under relevance constraint: $f_i(t)$\\
$\rightarrow$ Graded again but highly decorrelated from the acoustic stimulus\\
5) Categorical behavioral decision-making requires further nonlinearities\\
$\rightarrow$ Parameters of a behavioral response may be graded (e.g. approach speed),
initiation of one behavior over another is categorical (e.g. approach/stay)
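The threshold rule above can be sketched numerically, using a toy kernel response and an assumed binning of amplitude levels:

```python
import numpy as np

t = np.linspace(0.0, 1.0, 50_000, endpoint=False)
c = np.tanh(10 * np.sin(2 * np.pi * 8 * t))   # toy kernel response, steepest mid-range

dc = np.gradient(c, t)                        # temporal derivative of c(t)
levels = np.linspace(c.min(), c.max(), 41)    # candidate threshold levels
bins = np.digitize(c, levels)

# Mean |derivative| of c(t) among the samples near each amplitude level;
# the level that is crossed fastest is the threshold candidate that makes
# T1, and hence f_i(t), least sensitive to scale variations.
speed = np.array([np.abs(dc[bins == k]).mean() if np.any(bins == k) else 0.0
                  for k in range(len(levels) + 1)])
best = levels[max(np.argmax(speed) - 1, 0)]   # lower edge of the fastest bin
```

For this symmetric toy response, the fastest-crossed level lies near the mid-range of $c_i(t)$, where the tanh saturation is steepest.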


\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{figures/fig_noise_env_sd_conversion.pdf}
\caption{\textbf{}
}
\label{}
\end{figure}
\FloatBarrier

\end{document}