2052 lines
120 KiB
TeX
2052 lines
120 KiB
TeX
\documentclass[a4paper, 12pt]{article}
|
|
|
|
\usepackage[left=2cm,right=2cm,top=2cm,bottom=2cm,includeheadfoot]{geometry}
|
|
% \usepackage[onehalfspacing]{setspace}
|
|
\usepackage{graphicx}
|
|
\usepackage{svg}
|
|
\usepackage{import}
|
|
\usepackage{float}
|
|
\usepackage{placeins}
|
|
\usepackage{parskip}
|
|
\usepackage{amsmath}
|
|
\usepackage{amssymb}
|
|
\usepackage{subcaption}
|
|
\usepackage[labelfont=bf, textfont=small]{caption}
|
|
\usepackage[german,english]{babel}
|
|
\addto\captionsenglish{\renewcommand{\figurename}{Fig.}}
|
|
\addto\captionsenglish{\renewcommand{\tablename}{Tab.}}
|
|
\usepackage[separate-uncertainty=true, locale=DE]{siunitx}
|
|
\sisetup{output-exponent-marker=\ensuremath{\mathrm{e}}}
|
|
% \usepackage[capitalize]{cleveref}
|
|
% \crefname{figure}{Fig.}{Figs.}
|
|
% \crefname{equation}{Eq.}{Eqs.}
|
|
% \creflabelformat{equation}{#2#1#3}
|
|
\usepackage[
|
|
backend=biber,
|
|
style=authoryear,
|
|
pluralothers=true,
|
|
maxcitenames=1,
|
|
mincitenames=1
|
|
]{biblatex}
|
|
\addbibresource{cite.bib}
|
|
%\bibdata
|
|
%\bibstyle
|
|
%\citation
|
|
|
|
\title{Emergent intensity invariance vs. signal-to-noise ratio at three consecutive processing stages along the grasshopper song recognition pathway}
|
|
\author{Jona Hartling, Jan Benda}
|
|
\date{}
|
|
|
|
\begin{document}
|
|
\maketitle{}
|
|
|
|
% Text references and citations:
|
|
\newcommand{\bcite}[1]{\mbox{\cite{#1}}}
|
|
% \newcommand{\fref}[1]{\mbox{\cref{#1}}}
|
|
% \newcommand{\fref}[1]{\mbox{Fig.\,\ref{#1}}}
|
|
% \newcommand{\eref}[1]{\mbox{\cref{#1}}}
|
|
% \newcommand{\eref}[1]{\mbox{Eq.\,\ref{#1}}}
|
|
|
|
% Subplot lettering:
|
|
\newcommand{\figa}{\textbf{a}}
|
|
\newcommand{\figb}{\textbf{b}}
|
|
\newcommand{\figc}{\textbf{c}}
|
|
\newcommand{\figd}{\textbf{d}}
|
|
\newcommand{\fige}{\textbf{e}}
|
|
|
|
% Math shorthands - Standard symbols:
|
|
\newcommand{\dec}{\log_{10}} % Logarithm base 10
|
|
\newcommand{\infint}{\int_{-\infty}^{+\infty}} % Improper integral over the entire real line
|
|
|
|
% Math shorthands - Spectral filtering:
|
|
\newcommand{\bp}{h_{\text{BP}}(t)} % Bandpass filter function
|
|
\newcommand{\lp}{h_{\text{LP}}(t)} % Lowpass filter function
|
|
\newcommand{\hp}{h_{\text{HP}}(t)} % Highpass filter function
|
|
\newcommand{\fc}{f_{\text{cut}}} % Filter cutoff frequency
|
|
\newcommand{\tlp}{T_{\text{LP}}} % Lowpass filter averaging interval
|
|
\newcommand{\thp}{T_{\text{HP}}} % Highpass filter adaptation interval
|
|
|
|
% Math shorthands - Early representations:
|
|
\newcommand{\raw}{x_{\text{raw}}} % Placeholder input signal
|
|
\newcommand{\filt}{x_{\text{filt}}} % Bandpass filtered signal
|
|
\newcommand{\env}{x_{\text{env}}} % Signal envelope
|
|
\newcommand{\db}{x_{\text{log}}} % Logarithmically scaled signal
|
|
\newcommand{\dbref}{x_{\text{ref}}} % Decibel reference intensity
|
|
\newcommand{\adapt}{x_{\text{adapt}}} % Adapted signal
|
|
|
|
% Math shorthands - Kernel parameters:
|
|
\newcommand{\kw}{\sigma} % Unspecific Gabor kernel width
|
|
\newcommand{\kf}{\omega} % Unspecific Gabor kernel frequency
|
|
\newcommand{\kp}{\phi} % Unspecific Gabor kernel phase
|
|
\newcommand{\kn}{n} % Unspecific Gabor kernel lobe number
|
|
\newcommand{\kwi}{\kw_i} % Specific Gabor kernel width
|
|
\newcommand{\kfi}{\kf_i} % Specific Gabor kernel frequency
|
|
\newcommand{\kpi}{\kp_i} % Specific Gabor kernel phase
|
|
\newcommand{\kni}{\kn_i} % Specific Gabor kernel lobe number
|
|
|
|
% Math shorthands - Auxiliary kernel parameters:
|
|
\newcommand{\fdrm}{\text{FDRM}} % Gaussian full duration relative to maximum
|
|
\newcommand{\rh}{h_{\text{rel}}} % Relative Gaussian height for FDRM calculation
|
|
|
|
% Math shorthands - Thresholding nonlinearity:
|
|
\newcommand{\thr}{\Theta_i} % Step function threshold value
|
|
\newcommand{\nl}{H(c_i\,-\,\thr)} % Shifted Heaviside step function
|
|
|
|
% Math shorthands - Intensity invariance analysis:
|
|
\newcommand{\soc}{s} % Song component of synthetic mixture
|
|
\newcommand{\noc}{\eta} % Noise component of synthetic mixture
|
|
\newcommand{\sca}{\alpha} % Multiplicative scale of song component
|
|
\newcommand{\xvar}{\sigma_{x}^{2}} % Variance of synthetic mixture
|
|
\newcommand{\svar}{\sigma_{\text{s}}^{2}} % Song component variance
|
|
\newcommand{\nvar}{\sigma_{\eta}^{2}} % Noise component variance
|
|
\newcommand{\xsig}{\sigma_x} % Standard deviation of synthetic mixture
|
|
\newcommand{\ssig}{\sigma_{\text{s}}} % Song component standard deviation
|
|
\newcommand{\nsig}{\sigma_{\eta}} % Noise component standard deviation
|
|
\newcommand{\pc}{p(c,\,T)} % Probability density (general interval)
|
|
\newcommand{\pclp}{p(c,\,\tlp)} % Probability density (lowpass interval)
|
|
\newcommand{\pci}{p(c_i,\,\tlp)} % Kernel-specific probability density (lowpass interval)
|
|
\newcommand{\muf}{\mu_{f_i}} % Average feature value
|
|
|
|
\section{Introduction}
|
|
% % Drosophila/visual/article:
|
|
% \bcite{ketkar2023multifaceted}
|
|
|
|
% % Drosophila/auditory/article:
|
|
% \bcite{ozeri2018fast}
|
|
|
|
% % Primate/auditory/review:
|
|
% \bcite{barbour2011intensity}
|
|
|
|
% % Cricket/auditory/article:
|
|
% \bcite{benda2008spike}
|
|
|
|
% % Locust/auditory/article:
|
|
% \bcite{clemens2010intensity}
|
|
|
|
% % Rodent/olfactory/article:
|
|
% \bcite{bolding2018recurrent}
|
|
|
|
% Introduction to intensity invariance:
|
|
Intensity invariance is a fundamental property of sensory systems across
|
|
modalities and species, from fruit flies~(\bcite{ozeri2018fast};
|
|
\bcite{ketkar2023multifaceted}) through crickets~(\bcite{benda2008spike}) and
|
|
grasshoppers~(\bcite{clemens2010intensity}) to
|
|
rodents~(\bcite{bolding2018recurrent}) and
|
|
primates~(\bcite{barbour2011intensity}). It allows for the robust recognition
|
|
of behaviorally relevant stimuli despite variations in stimulus intensity.
|
|
However, the computational mechanisms underlying intensity invariance are often
|
|
difficult to disentangle. Here, we use a physiologically inspired functional
|
|
model of the grasshopper song recognition pathway to investigate the emergence
|
|
of intensity invariance throughout the auditory processing stream.
|
|
|
|
% Why the grasshopper auditory system?
|
|
% Why focus on song recognition among other auditory functions?
|
|
The auditory system of grasshoppers~(\textit{Acrididae}) has been studied
|
|
extensively over the years. Grasshoppers rely on their sense of hearing for
|
|
intraspecific communication --- including mate
|
|
attraction~(\bcite{helversen1972gesang}) and
|
|
evaluation~(\bcite{stange2012grasshopper}), sender
|
|
localization~(\bcite{helversen1988interaural}), courtship
|
|
display~(\bcite{elsner1968neuromuskularen}), and rival
|
|
deterrence~(\bcite{greenfield1993acoustic}) --- and have evolved a variety of
|
|
acoustic signals for different behavioral
|
|
contexts~(\bcite{otte1970comparative}). The most conspicuous acoustic signals
|
|
of grasshoppers are their species-specific calling songs, which broadcast the
|
|
presence of the singing individual to potential mates within range. These songs
|
|
are usually more characteristic of a species than morphological
|
|
traits~(\bcite{tishechkin2016acoustic}; \bcite{tarasova2021eurasius}), which
|
|
can vary greatly within species~(\bcite{rowell1972variable};
|
|
\bcite{kohler2017morphological}). The reliance on songs to mediate reproduction
|
|
represents a strong evolutionary driving force that resulted in a massive
|
|
species diversification~(\bcite{vedenina2011speciation};
|
|
\bcite{sevastianov2023evolution}), with over 6800 recognized species in the
|
|
\textit{Acrididae} family~(\bcite{cigliano2024orthoptera}).
|
|
|
|
% What are the signals that the auditory system is supposed to recognize?
|
|
Grasshopper songs are amplitude-modulated broad-band acoustic signals. They
|
|
consist of a series of noisy syllables and relatively quiet pauses, which form
|
|
a characteristic repetitive pattern~(\bcite{helversen1977stridulatory};
|
|
\bcite{stumpner1994song}). Song recognition depends on certain structural
|
|
parameters of this pattern --- such as the duration of syllables and
|
|
pauses~(\bcite{helversen1972gesang}), the slope of pulse
|
|
onsets~(\bcite{helversen1993absolute}), and the accentuation of syllable onsets
|
|
relative to the preceding pause~(\bcite{balakrishnan2001song};
|
|
\bcite{helversen2004acoustic}) --- which are sufficiently conveyed by the
|
|
amplitude modulation of the song alone~(\bcite{helversen1997recognition}).
|
|
|
|
% Why is intensity invariance important for song recognition?
|
|
Grasshopper songs, like all acoustic signals, are subject to sound attenuation,
|
|
which depends on the distance from the sound source, the frequency content of
|
|
the signal, and the vegetation of the habitat~(\bcite{michelsen1978sound}).
|
|
Sound attenuation has two major consequences for song recognition. First, the
|
|
amplitude dynamics of the song pattern degrade with increasing distance to the
|
|
sender, which limits the effective communication range of grasshoppers
|
|
to~\mbox{1--2\,m} in their typical grassland
|
|
habitats~(\bcite{lang2000acoustic}). Second, the intensity of a song at the
|
|
receiver's position varies with the position of the sender, which should
|
|
ideally not affect song recognition. The auditory system thus needs to achieve
|
|
a certain degree of intensity invariance --- a time scale-selective sensitivity
|
|
to faster amplitude dynamics and simultaneous insensitivity to more sustained
|
|
amplitude dynamics. Intensity invariance is commonly associated with neural
|
|
adaptation~(\bcite{benda2008spike}; \bcite{barbour2011intensity};
|
|
\bcite{ozeri2018fast}; more general:~\bcite{benda2021neural}). Different neuron
|
|
types in the grasshopper auditory system exhibit spike-frequency adaptation in
|
|
response to sustained stimulation~(\bcite{romer1976informationsverarbeitung};
|
|
\bcite{gollisch2004input}; \bcite{hildebrandt2009origin};
|
|
\bcite{clemens2010intensity}; \bcite{fisch2012channel}). Accordingly, intensity
|
|
invariance is not the result of a single processing step but rather a gradual
|
|
process, in which different neuronal populations contribute to varying
|
|
degrees~(\bcite{clemens2010intensity}) and by different
|
|
mechanisms~(\bcite{hildebrandt2009origin}).
|
|
|
|
% How did we expand on the previous framework (feat. Clemens et al.)?
|
|
In the current study, we leverage functional modelling to trace the emergence
|
|
of intensity invariance through individual processing steps of the grasshopper
|
|
song recognition pathway. The model pathway we propose here is based on a
|
|
previous functional model framework for song recognition in both
|
|
crickets~(\bcite{clemens2013computational}; \bcite{hennig2014time}) and
|
|
grasshoppers~(\bcite{clemens2013feature}; review on
|
|
both:~\bcite{ronacher2015computational}). The existing framework relies on
|
|
pulse trains as input signals, which were designed to capture the essential
|
|
structural properties of natural song envelopes~(\bcite{clemens2013feature}).
|
|
It includes feature extraction by a bank of linear-nonlinear feature detectors,
|
|
evidence accumulation by temporal averaging of each feature, and categorical
|
|
decision making by a weighted linear combination of feature values. We adopted
|
|
the general structure of the existing framework and extended it by a
|
|
physiologically plausible preprocessing stage --- including spectral filtering,
|
|
envelope extraction, logarithmic compression, and intensity adaptation ---
|
|
which allows the model to operate on unmodified recordings of natural
|
|
grasshopper songs. The resulting model pathway thus covers the entire auditory
|
|
processing stream from the initial reception of airborne sound waves to the
|
|
generation of a high-dimensional feature representation that allows for the
|
|
categorical recognition of conspecific songs. It incorporates anatomical,
|
|
physiological, and ethological evidence from several decades of research on the
|
|
grasshopper auditory system. In the following, we provide a side-by-side
|
|
account of the known physiological processing steps along the song recognition
|
|
pathway and their functional approximations in the model pathway. We then
|
|
elaborate on the computational mechanisms that contribute to the emergence of
|
|
intensity-invariant song representations, the interaction between these
|
|
mechanisms, the overall capacity for intensity invariance in the system, and
|
|
the ethological implications of our findings.
|
|
|
|
\newpage
|
|
\section{Methods}
|
|
% This maybe does not quite fit here, but it is the most general part of the
|
|
% methods and applies throughout the whole section, so I put it here for now.
|
|
All modeling, data analysis, and data visualization were performed in
|
|
Python~3.12.3 except for the pathway overview~(Fig.\,\ref{fig:pathway}), which
|
|
was assembled in Inkscape~1.2. The code base for the model pathway is available
|
|
as the \textit{thunderhopper} package, version 1.0, on PyPI. Any audio data was
|
|
inspected and edited with the help of the \textit{audian} package, version 2.4,
|
|
on PyPI.
|
|
|
|
\subsection{Functional model of the grasshopper song recognition pathway}
|
|
|
|
The anatomical organisation of the grasshopper song recognition pathway can be
|
|
outlined as a feed-forward network of three consecutive neuronal
|
|
populations~(Fig.\,\ref{fig:pathway}a--c): Peripheral auditory receptor neurons,
|
|
whose axons enter the ventral nerve cord (VNC) at the level of the metathoracic
|
|
ganglion; local interneurons that remain exclusively within the thoracic region
|
|
of the VNC; and ascending neurons projecting from the thoracic region towards
|
|
the supraesophageal ganglion (SEG), or central
|
|
brain~(\bcite{rehbein1974structure}; \bcite{rehbein1976auditory};
|
|
\bcite{eichendorf1980projections}). The input to the network originates at the
|
|
tympanal membrane, which acts as acoustic receiver and is coupled to the
|
|
dendritic endings of the receptor neurons~(\bcite{gray1960fine}). The outputs
|
|
from the network converge in the SEG, which presumably harbors the neuronal
|
|
substrate for conspecific song recognition and response
|
|
initiation~(\bcite{ronacher1986routes}; \bcite{bauer1987separate};
|
|
\bcite{bhavsar2017brain}).
|
|
|
|
Around 15 to 20 ascending neurons have been identified in the grasshopper
|
|
auditory system~(\bcite{stumpner1991auditory}), whose functional
|
|
characteristics are conserved even between species that are not closely
|
|
related~(\bcite{neuhofer2008evolutionarily}). The population of ascending
|
|
neurons possesses a diverse range of response properties that contrasts with
|
|
the rather homogeneous responses of receptor neurons and local
|
|
interneurons~(\bcite{clemens2011efficient}), which suggests a transition from a
|
|
uniform population-wide processing stream into several parallel branches.
|
|
Accordingly, the model pathway is divided into two distinct
|
|
stages~(Fig.\,\ref{fig:pathway}d): The preprocessing stage incorporates the
|
|
processing steps at the levels of the tympanal membrane, the receptor neurons,
|
|
and the local interneurons; and operates on one-dimensional signal
|
|
representations~(Fig.\,\ref{fig:stages_pre}). The feature extraction stage
|
|
corresponds to the processing within the ascending neurons and further
|
|
downstream towards the SEG; and operates on high-dimensional signal
|
|
representations~(Fig.\,\ref{fig:stages_feat}). The details of each
|
|
physiological processing step and its functional approximation are described in
|
|
the following sections.
|
|
|
|
Around 15 to 20 ascending neurons have been identified in the grasshopper
|
|
auditory system~(\bcite{stumpner1991auditory}), whose functional
|
|
characteristics are conserved even between species that are not closely
|
|
related~(\bcite{neuhofer2008evolutionarily}). The population of ascending
|
|
neurons possesses a diverse range of response properties that contrasts with
|
|
the rather homogeneous responses of receptor neurons and local
|
|
interneurons~(\bcite{clemens2011efficient}), which suggests a transition from a
|
|
uniform population-wide processing stream into several parallel branches.
|
|
Accordingly, the model pathway is divided into two distinct
|
|
stages~(Fig.\,\ref{fig:pathway}d): The preprocessing stage incorporates the
|
|
processing steps at the levels of the tympanal membrane, the receptor neurons,
|
|
and the local interneurons; and operates on one-dimensional signal
|
|
representations~(Fig.\,\ref{fig:stages_pre}). The feature extraction stage
|
|
corresponds to the processing within the ascending neurons and further
|
|
downstream towards the SEG; and operates on high-dimensional signal
|
|
representations~(Fig.\,\ref{fig:stages_feat}). The details of each
|
|
physiological processing step and its functional approximation are described in
|
|
the following sections.
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_auditory_pathway.pdf}
|
|
\caption{\textbf{Schematic organisation of the grasshopper song recognition
|
|
pathway and structure of the functional model pathway.}
|
|
\textbf{a}:~Simplified course of the pathway in the
|
|
grasshopper, from the tympanal membrane via receptor
|
|
neurons, local interneurons, and ascending neurons further
|
|
towards the supraesophageal ganglion.
|
|
\textbf{b}:~Schematic of synaptic connections between
|
|
the three neuronal populations within the metathoracic
|
|
ganglion.
|
|
\textbf{c}:~Network representation of neuronal connectivity.
|
|
\textbf{d}:~Flow diagram of consecutive signal
|
|
representations~(boxes) and transformations~(arrows) along
|
|
the model pathway. All representations are time-varying.
|
|
1st half: Preprocessing stage~(one-dimensional
|
|
representations). 2nd half: Feature extraction
|
|
stage~(high-dimensional representations). }
|
|
\label{fig:pathway}
|
|
\end{figure}
|
|
|
|
\subsubsection{Population-driven signal preprocessing}
|
|
|
|
Grasshoppers receive airborne sound waves by a tympanal organ at each side of
|
|
the body. The tympanal membrane acts as a mechanical resonance filter for
|
|
sound-induced vibrations~(\bcite{windmill2008time}; \bcite{malkin2014energy}).
|
|
Vibrations that fall within specific frequency bands are focused on different
|
|
membrane areas, while others are attenuated. This processing step can be
|
|
approximated by an initial bandpass filter~(Fig.\,\ref{fig:stages_pre}a)
|
|
applied to the acoustic input signal $\raw(t)$:
|
|
\begin{equation}
|
|
\filt(t)\,=\,\raw(t)\,*\,\bp, \qquad \fc\,=\,5\,\text{kHz},\,30\,\text{kHz}
|
|
\label{eq:bandpass}
|
|
\end{equation}
|
|
The receptor neurons transduce the vibrations of the tympanal membrane into
|
|
sequences of action potentials. They thereby encode the amplitude modulation,
|
|
or envelope, of the signal~(\bcite{machens2001discrimination}), which likely
|
|
involves a rectifying nonlinearity~(\bcite{machens2001representation}). The
|
|
extraction of the signal envelope~(Fig.\,\ref{fig:stages_pre}b) can be modelled
|
|
as full-wave rectification followed by lowpass filtering of the tympanal signal
|
|
$\filt(t)$:
|
|
\begin{equation}
|
|
\env(t)\,=\,|\filt(t)|\,*\,\lp, \qquad \fc\,=\,250\,\text{Hz}
|
|
\label{eq:env}
|
|
\end{equation}
|
|
Furthermore, the receptors exhibit a sigmoidal response curve over
|
|
logarithmically compressed stimulus intensities~(\bcite{suga1960peripheral};
|
|
\bcite{gollisch2002energy}). In the model pathway, logarithmic
|
|
compression~(Fig.\,\ref{fig:stages_pre}c) is achieved by conversion to decibel
|
|
scale
|
|
\begin{equation}
|
|
\db(t)\,=\,20\,\cdot\,\dec \frac{\env(t)}{\dbref}, \qquad \dbref\,=\,1
|
|
\label{eq:log}
|
|
\end{equation}
|
|
relative to the common reference intensity $\dbref$. Both the receptor
|
|
neurons~(\bcite{romer1976informationsverarbeitung}; \bcite{gollisch2004input};
|
|
\bcite{fisch2012channel}) and, on a larger scale, the subsequent local
|
|
interneurons~(\bcite{hildebrandt2009origin}; \bcite{clemens2010intensity})
|
|
adapt their firing rates in response to sustained stimulus intensities, which
|
|
allows for the robust encoding of faster amplitude modulations against a slowly
|
|
changing overall baseline intensity. Functionally, the adaptation mechanism
|
|
resembles a highpass filter~(Fig.\,\ref{fig:stages_pre}d) over the
|
|
logarithmically compressed envelope $\db(t)$:
|
|
\begin{equation}
|
|
\adapt(t)\,=\,\db(t)\,*\,\hp, \qquad \fc\,=\,10\,\text{Hz}
|
|
\label{eq:highpass}
|
|
\end{equation}
|
|
This processing step concludes the preprocessing stage of the model pathway.
|
|
The resulting intensity-adapted envelope $\adapt(t)$ is then passed on from the
|
|
local interneurons to the ascending neurons, where it serves as the basis for
|
|
the following feature extraction stage.
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_pre_stages.pdf}
|
|
\caption{\textbf{Song representations during the preprocessing stage.}
|
|
Example song of \textit{O. rufipes}.
|
|
\textbf{a}:~Bandpass filtered tympanal signal $\filt(t)$.
|
|
\textbf{b}:~Signal envelope $\env(t)$.
|
|
\textbf{c}:~Logarithmically compressed envelope $\db(t)$.
|
|
\textbf{d}:~Intensity-adapted envelope $\adapt(t)$.
|
|
}
|
|
\label{fig:stages_pre}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\subsubsection{Feature extraction by individual neurons}
|
|
|
|
The population of ascending neurons extracts and encodes a number of different
|
|
features of the preprocessed signal, and hence represents the signal in a
|
|
higher-dimensional space than the preceding receptor neurons and local
|
|
interneurons~(\bcite{clemens2011efficient}). Each ascending neuron is assumed
|
|
to scan the signal for a specific template pattern, which can be thought of as
|
|
a kernel of a particular structure and on a particular time scale. This
|
|
process, known as template matching, can be modelled as a convolution of the
|
|
intensity-adapted envelope $\adapt(t)$ with a kernel $k_i(t)$ specific to the
|
|
$i$-th ascending neuron:
|
|
\begin{equation}
|
|
c_i(t)\,=\,\adapt(t)\,*\,k_i(t)
|
|
= \infint \adapt(\tau)\,\cdot\,k_i(t\,-\,\tau)\,d\tau
|
|
\label{eq:conv}
|
|
\end{equation}
|
|
We use Gabor kernels as basis functions for creating different template
|
|
patterns. Gabor functions presumably capture the essential structural
|
|
properties of the filter functions found in various auditory
|
|
neurons~(\bcite{rokem2006spike}; \bcite{clemens2011efficient};
|
|
\bcite{clemens2012nonlinear}). An arbitrary one-dimensional, real Gabor kernel
|
|
is generated by multiplication of a Gaussian envelope with standard deviation
|
|
or kernel width $\kwi$ and a sinusoidal carrier with frequency $\kfi$ and phase
|
|
$\kpi$:
|
|
\begin{equation}
|
|
k_i(t,\,\kwi,\,\kfi,\,\kpi)\,=\,e^{-\frac{t^{2}}{2{\kwi}^{2}}}\,\cdot\,\sin(\kfi\,t\,+\,\kpi), \qquad \kfi\,=\,2\pi f_{\text{sin}_i}
|
|
\label{eq:gabor}
|
|
\end{equation}
|
|
Different combinations of $\kwi$ and $\kfi$ result in Gabor kernels with
|
|
different lobe number $\kni$, which is the number of half-periods of the
|
|
carrier that fit under the Gaussian envelope within reasonable limits of
|
|
attenuation. The time window under the Gaussian envelope that contains the
|
|
relevant lobes of the kernel can be defined as Gaussian full duration at height
|
|
$\rh$ relative to the maximum of the Gaussian:
|
|
\begin{equation}
|
|
\fdrm(\kwi,\,\rh)\,=\,2\,\cdot\,\sqrt{-2\,\cdot\,\ln \rh}\cdot\,\kwi, \qquad \rh\,\in\,(0,\,1]
|
|
\label{eq:fdrm}
|
|
\end{equation}
|
|
% Yes, FDRM is a hideous acronym. Based on the common "full width at half
|
|
% maximum" (FWHM) and adjusted because "full duration at half maximum" (FDHM)
|
|
% is apparently preferred in a temporal context. Alternatively, "w_\text{gauss}"?
|
|
With this, an appropriate carrier frequency $\kfi$ for obtaining a Gabor kernel
|
|
with width $\kwi$ and desired lobe number $\kni$ can be approximated as
|
|
\begin{equation}
|
|
\kfi(\kni,\,\kwi,\,\rh)\,=\,2\pi\,\cdot\,\frac{0.5\,\cdot\,\kni\,+\,\beta_0}{\fdrm(\kwi,\,\rh)}, \qquad \kni\,\in\,\mathbb{Z},\enspace \kni\,\geq\,2
|
|
\label{eq:gabor_freq}
|
|
\end{equation}
|
|
% \begin{equation}
|
|
% \kfi(\kni,\,\kwi,\,\rh)\,=\,\frac{0.5\,\cdot\,\kni\,+\,\beta_0}{2\,\cdot\,\sqrt{-2\,\cdot\,\ln \rh}\cdot\kwi}, \qquad \kni\,\geq\,2\enspace\forall\enspace \kni\,\in\,\mathbb{Z}
|
|
% \end{equation}
|
|
The relationship between $\kfi$ and $\kni$ is approximately linear except for
|
|
small $\kni$. The offset term $\beta_0\approx0.26$ was added to balance the
|
|
amplitudes of the $\kni$ desired lobes of the kernel --- which should be
|
|
maximized --- against the amplitudes of the next-outer lobes, which should not
|
|
exceed the threshold value determined by $\rh$. Note that simple Gaussian
|
|
kernels with $\kni=1$ can be obtained by setting the carrier frequency to
|
|
$\kfi=0$ and are hence not covered by Eq.\,\ref{eq:gabor_freq}.
|
|
|
|
Carrier phase $\kpi$ determines the position of the kernel lobes relative to
|
|
the kernel center. We restrict the Gabor kernels to be either even or odd
|
|
functions by setting $\kpi$ to one of only four specific phase
|
|
values~(Tab.\,\ref{tab:gabor_phases}). Even Gabor kernels are mirror-symmetric
|
|
with odd $\kni$, whereas odd Gabor kernels are point-symmetric with even
|
|
$\kni$. Both even and odd kernels can have either positive or negative sign,
|
|
which refers to the sign of the kernel's central lobe (even kernels) or the
|
|
left of the two central lobes (odd kernels). These four major groups of Gabor
|
|
kernels allow for the extraction of different types of signal features, such as
|
|
the presence of peaks (even, $+$), troughs (even, $-$), onsets (odd, $+$), and
|
|
offsets (odd, $-$) at various time scales.
|
|
\FloatBarrier
|
|
\begin{table}[!ht]
|
|
\centering
|
|
\captionsetup{width=.45\textwidth}
|
|
\caption{Values of phase $\kp$ that are specific for the four major groups
|
|
of Gabor kernels.}
|
|
\begin{tabular}{|ccc|}
|
|
\hline
|
|
sign & even kernels & odd kernels\\
|
|
\hline
|
|
$+$ & $+\pi\,/\,2$ & $\pi$\\
|
|
$-$ & $-\pi\,/\,2$ & $0$\\
|
|
\hline
|
|
\end{tabular}
|
|
\label{tab:gabor_phases}
|
|
\end{table}
|
|
\FloatBarrier
|
|
Following the convolutional template matching~(Fig.\,\ref{fig:stages_feat}a),
|
|
each kernel-specific response $c_i(t)$ is passed through a shifted Heaviside
|
|
step-function $\nl$ with threshold value $\thr$ to obtain a binary
|
|
response~(Fig.\,\ref{fig:stages_feat}b):
|
|
\begin{equation}
|
|
b_i(t,\,\thr)\,=\,\begin{cases}
|
|
\;1, \quad c_i(t)\,>\,\thr\\
|
|
\;0, \quad c_i(t)\,\leq\,\thr
|
|
\end{cases}
|
|
\label{eq:binary}
|
|
\end{equation}
|
|
The thresholding of $c_i(t)$ into $b_i(t)$ can be thought of as a
|
|
categorization into ``relevant'' and ``irrelevant'' response values. Similar
|
|
thresholding nonlinearities have been a crucial processing step in previous
|
|
models that deal with the extraction of behaviorally relevant song features in
|
|
insects~(\bcite{clemens2013computational}; \bcite{clemens2013feature};
|
|
\bcite{hennig2014time}; \bcite{ronacher2015computational}).
|
|
% However, there is no direct physiological evidence that would allow to
|
|
% determine the exact location or underlying mechanism of such a nonlinearity in
|
|
% either the ascending neurons or at some point further downstream in the SEG.
|
|
|
|
In the grasshopper, the responses of the ascending neurons are assumed to be
|
|
integrated somewhere in the SEG~(\bcite{ronacher1986routes};
|
|
\bcite{bauer1987separate}; \bcite{bhavsar2017brain}). In the model pathway,
|
|
temporal integration is implemented as temporal averaging of the binary
|
|
responses $b_i(t)$ by a lowpass filter with extremely low cutoff frequency:
|
|
\begin{equation}
|
|
f_i(t)\,=\,b_i(t)\,*\,\lp, \qquad \fc\,=\,1\,\text{Hz}
|
|
\label{eq:lowpass}
|
|
\end{equation}
|
|
This processing step results in a set of slowly changing kernel-specific
|
|
features $f_i(t)$, which is the final representation along the model
|
|
pathway~(Fig.\,\ref{fig:stages_feat}c). In the resulting high-dimensional
|
|
feature space, different species-specific song patterns can be distinguished by
|
|
their distinct combination of feature values, e.\,g.\ using Euclidean geometry
|
|
or a simple linear classifier.
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_feat_stages.pdf}
|
|
\caption{\textbf{Song representations during the feature extraction stage.}
|
|
Example song of \textit{O. rufipes}.
|
|
Different color shades indicate different types of Gabor
|
|
kernels with specific lobe number $\kni$ and either $+$ or
|
|
$-$ sign, sorted (dark to light) first by increasing
|
|
$\kni$ and then by sign~($1\,\leq\,\kni\,\leq\,4$; first
|
|
$+$, then $-$ for each $\kni$; two kernel widths $\kwi$ of
|
|
$4\,$ms and $32\,$ms per type; 8 types, 16 kernels in
|
|
total).
|
|
\textbf{a}:~Kernel-specific filter responses $c_i(t)$.
|
|
\textbf{b}:~Binary responses $b_i(t)$.
|
|
\textbf{c}:~Finalized features $f_i(t)$.}
|
|
\label{fig:stages_feat}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\subsection{Simulation-based analysis of the model pathway}
|
|
|
|
\subsubsection{Data sourcing}
|
|
|
|
All simulations were based on a dataset that was assembled from five different
|
|
sources, each of which is an established reference for the identification of
|
|
European grasshopper species. The dataset was limited to six species from the
|
|
species-rich \textit{Gomphocerinae} sub-family that are known to be common
|
|
throughout Central and Southern Europe. All recordings were converted to
|
|
standard~\textit{.wav}~format with a sampling rate of~44.1\,kHz and an
|
|
amplitude scale in arbitrary units. Individual songs were then cut from each
|
|
recording. The dataset includes a total of 31 recordings across species, which
|
|
amounts to a total of 153 isolated songs. However, the number of available
|
|
species-specific songs varies greatly across species, with a maximum of 48
|
|
songs for \textit{C. biguttulus} and a minimum of 6 songs for \textit{C.
|
|
mollis}~(Tab.\,\ref{tab:species_list}).
|
|
|
|
\begin{itemize}
|
|
\item ``Heuschrecken beobachten, bestimmen'' by~Heiko~Bellmann\\
|
|
1$^{\text{st}}$\,edition, 1993, Naturbuch, Augsburg
|
|
\item ``Gesänge der heimischen Heuschrecken. Akustisch-optische
|
|
Bestimmungshilfe.''\\
|
|
by~Karl-Heinz~Garberding, Deutscher Jugendbund für Naturbeobachtung\\
|
|
1$^{\text{st}}$\,edition, 2001, DJN, Göttingen
|
|
\item ``Heuschrecken -- Die Stimmen von 61 heimischen Arten''
|
|
by~Heiko~Bellmann\\
|
|
1$^{\text{st}}$\,edition, 2004, AMPLE, Germering
|
|
\item ``Fauna d'Italia XLVIII -- Orthoptera'' by~Bruno~Massa, Paolo~Fontana,
|
|
Filippo~M.~Buzzetti, Roy~M.J.C.~Kleukers, Baudewijn~Odé\\
|
|
1$^{\text{st}}$\,edition, 2012, edagricola, Milano
|
|
\item ``Singing Orthoptera of Slovenia'' by~Stanislav~Gomboc, Blaz~Segula\\
|
|
1$^{\text{st}}$\,edition, 2014, EGEA, Ljubljana
|
|
\end{itemize}
|
|
|
|
\begin{table}[!ht]
|
|
\centering
|
|
\captionsetup{width=.75\textwidth}
|
|
\caption{Overview of the six grasshopper species from the
|
|
\textit{Gomphocerinae} sub-family, the number of sources per species, the
|
|
number of available recordings across sources, and the number of isolated
|
|
songs across recordings.}
|
|
\begin{tabular}{|lccc|}
|
|
\hline
|
|
\textbf{Species} & \textbf{Sources} & \textbf{Recordings} & \textbf{Songs}\\
|
|
\hline
|
|
\textit{Chorthippus biguttulus} & 5 & 6 & 48\\
|
|
\textit{Chorthippus mollis} & 3 & 3 & 6\\
|
|
\textit{Chrysochraon dispar} & 4 & 5 & 45\\
|
|
\textit{Gomphocerippus rufus} & 4 & 8 & 16\\
|
|
\textit{Omocestus rufipes} & 4 & 5 & 14\\
|
|
\textit{Pseudochorthippus parallelus} & 4 & 4 & 24\\
|
|
\hline
|
|
\end{tabular}
|
|
\label{tab:species_list}
|
|
\end{table}
|
|
|
|
\subsubsection{Generation of synthetic input signals}
|
|
|
|
Different processing steps along the model pathway were tested for intensity
|
|
invariance by generating synthetic input signals $x(t)$ of varying intensity,
|
|
transforming them through the respective processing steps, and comparing the
|
|
resulting signal representations. Inputs were generated for two distinct cases.
|
|
In the idealized, noiseless case, $x(t)$ consists of a song component $\soc(t)$
|
|
with $\ssig=1$ and a multiplicative scale $\sca$:
|
|
\begin{equation}
|
|
x(t)\,=\,\sca\,\cdot\,\soc(t), \qquad \sca\,\geq\,0
|
|
\label{eq:noiseless}
|
|
\end{equation}
|
|
In the noiseless case, $x(t)$ is hence only a scaled version of $\soc(t)$ with
|
|
$\xsig=\sca$. In the more realistic, noisy case, $x(t)$ consists of the same
|
|
song component $\soc(t)$ scaled by $\sca$ and an additive noise component
|
|
$\noc(t)$ with $\nsig=1$:
|
|
\begin{equation}
|
|
x(t)\,=\,\sca\,\cdot\,\soc(t)\,+\,\noc(t), \qquad \sca\,\geq\,0
|
|
\label{eq:noisy}
|
|
\end{equation}
|
|
Accordingly, the signal-to-noise ratio (SNR) of input $x(t)$ in the noisy case
|
|
equals the squared $\sca$ value:
|
|
\begin{equation}
|
|
\text{SNR}_x(\sca)\,=\,\frac{(\sca\,\cdot\,\ssig)^2}{\nsig^2}\,=\,\sca^2, \qquad \ssig\,=\,\nsig\,=\,1
|
|
\label{eq:input_snr}
|
|
\end{equation}
|
|
For most analyses, it would be sufficient if input $x(t)$ corresponds to the
|
|
signal representation immediately before the first of the tested
|
|
transformations. For instance, when testing the effects of logarithmic
|
|
compression~(Eq.\,\ref{eq:log}), $x(t)$ would correspond to the signal envelope
|
|
$\env(t)$. However, in this particular case, $\env(t)$ results from a nonlinear
|
|
transformation~(Eq.\,\ref{eq:env}), which cannot be synthesized as an additive
|
|
mixture of $\soc(t)$ and $\noc(t)$. For this reason, any input $x(t)$ across
|
|
all analyses corresponds not to the representation immediately before the
|
|
tested transformations but its predecessor representation instead. Therefore,
|
|
when testing logarithmic compression, $x(t)$ corresponds to the tympanal signal
|
|
$\filt(t)$ instead of $\env(t)$.
|
|
|
|
The raw $\soc(t)$ was drawn from the dataset of isolated species-specific song
|
|
recordings, whereas the raw $\noc(t)$ consists of a segment of normally
|
|
distributed white noise. Both $\soc(t)$ and $\noc(t)$ were normalized to unit
|
|
standard deviation. These can be used without further processing for all
|
|
analyses where input $x(t)$ corresponds to $\raw(t)$. For analyses where $x(t)$
|
|
corresponds to a later representation, $\soc(t)$ and $\noc(t)$ were first
|
|
processed along the model pathway up to the required representation, again
|
|
normalized to unit standard deviation, and then used to generate $x(t)$
|
|
according to either Eq.\,\ref{eq:noiseless} in the noiseless case or
|
|
Eq.\,\ref{eq:noisy} in the noisy case.
|
|
|
|
\subsubsection{Quantifying signal intensity across representations}
|
|
\label{sec:intensity_measures}
|
|
|
|
All intensity measures were calculated over a manually labeled segment within
|
|
each song. Segments always excluded the first and last few syllables to allow
|
|
slowly changing representations such as $f_i(t)$ to stabilize. The duration of
|
|
each segment and the number of contained syllables depend on the duration of
|
|
the species-specific song. Care was taken to ensure that the segment contained
|
|
a sufficient number of syllables to obtain a reliable estimate of the intensity
|
|
measures.
|
|
|
|
The standard deviation $\sigma$ was used as a measure of intensity for all
|
|
representations resulting from the transformation of input $x(t)$ up to and
|
|
including the kernel responses $c_i(t)$, for which individual $\sigma_{c_i}$
|
|
were used as kernel-specific intensity measures. The binary responses $b_i(t)$
|
|
were deemed too similar to the features $f_i(t)$ to warrant their own intensity
|
|
measure and were hence omitted from all related analyses. For $f_i(t)$,
|
|
$\sigma$ is not an appropriate intensity measure because each $f_i(t)$ is
|
|
ideally constant with $\sigma=0$ for the duration of a song. Therefore, the
|
|
average value $\muf$ of each $f_i(t)$ was used as a kernel-specific intensity
|
|
measure instead.
|
|
|
|
It is arguably not ideal to quantify the intensity of $c_i(t)$ and $f_i(t)$
|
|
separately for each kernel. Overall, these representations are not separate
|
|
signals bundled together but rather a set that acts as a unit with a single
|
|
intensity measure. However, there is no straightforward way to quantify the
|
|
intensity of $c_i(t)$ or $f_i(t)$ as a whole that would not entail a certain
|
|
ambiguity, e.\,g.\ by averaging across kernels. In this sense, we opted for the
|
|
kernel-specific approach because it allows us to assess differences in the
|
|
dependency on $\sca$ between individual members of either $c_i(t)$ or
|
|
$f_i(t)$.
|
|
|
|
The absolute intensity measures make it possible to compare the intensity of a
|
|
representation across different $\sca$ values. Additionally, ratios were
|
|
calculated between the intensity measures for $\sca>0$ and the respective
|
|
pure-noise reference measure for $\sca=0$ to better compare the intensities of
|
|
different representations. This is only possible in the noisy case, where input
|
|
$x(t)=\noc(t)$ for $\sca=0$, whereas $x(t)=0$ for $\sca=0$ in the noiseless
|
|
case. At the level of input $x(t)$, the ratio of intensity measures depends on
|
|
$\sca$ as follows:
|
|
\begin{equation}
|
|
\frac{\xsig}{\nsig}\,=\,\sqrt{\frac{\xsig^2}{\nsig^2}}\,=\,\sqrt{\frac{(\sca\,\cdot\,\ssig)^2\,+\,\nsig^2}{\nsig^2}}\,=\,\sqrt{\sca^2\,+\,1}, \qquad \ssig\,=\,\nsig\,=\,1
|
|
\label{eq:input_ratio}
|
|
\end{equation}
|
|
This holds only if $\soc(t)\perp\noc(t)$, so that $\xsig^2=(\sca\,\cdot\,\ssig)^2+\nsig^2$,
|
|
which is a reasonable assumption for the raw $\soc(t)$ and $\noc(t)$. However,
|
|
the dependency of the ratio on $\sca$ is not necessarily the same for
|
|
representations that are transformed from $x(t)$ by nonlinear operations, since
|
|
these change the relationship of $\soc(t)$ and $\noc(t)$ in an unpredictable
|
|
fashion~(see appendix Fig.\,\ref{fig:app_env-sd}). Furthermore, the ratio is
|
|
not a proper SNR of the representation because it does not relate $\soc(t)$ to
|
|
$\noc(t)$ within the representation but rather the entire representation to
|
|
$\noc(t)$ alone. However, it still provides a useful measure of the relative
|
|
intensity of a representation with and without $\soc(t)$, which is the closest
|
|
we can get to the SNR of the representation. As such, the ratio of intensity
|
|
measures is referred to as SNR in the following.
|
|
% Is this legal? "SNR" is much shorter than "ratio of intensity measure to the pure-noise reference measure".
|
|
% Haven't used it much yet, sticked to "ratio" in most cases.
|
|
|
|
\subsection{Field data-based analysis of the model pathway}
|
|
|
|
Field recordings were taken on a meadow in the vicinity of the University of
|
|
Tübingen, Germany, during the day in August~2024. All recordings were taken
|
|
using a custom hand-held microphone array that was assembled from eight
|
|
omnidirectional AV-TEFE TCM141 condenser microphones. The microphones were
|
|
arranged in a linear configuration with a spacing of 30\,cm between adjacent
|
|
microphones and oriented in the same direction along the axis of the array. All
|
|
microphones were connected to a custom 8-channel amplification and
|
|
digitization system based on a Teensy 4.1 microcontroller with real-time clock
|
|
and microSD card storage. Recordings were written to the microSD card
|
|
in~\textit{.wav}~format with a sampling rate of 96\,kHz and an amplitude scale
|
|
in arbitrary units. The microphone array was held at a height of approximately
|
|
30\,cm above the ground, which was slightly above the height of most
|
|
surrounding vegetation and at the same height as the singing grasshopper. The
|
|
array was moved as close to the grasshopper as possible without interrupting
|
|
its song production, which amounts to an approximate offset distance of 10\,cm
|
|
between the animal and the leading microphone. Care was taken to maintain a
|
|
stable position and height of the microphone array during recording. The
|
|
resulting recordings were then processed through the model pathway and analyzed
|
|
according to the procedure described in Section~\ref{sec:intensity_measures}.
|
|
|
|
\subsection{Determining kernel-specific threshold values}
|
|
|
|
Different kernels $k_i(t)$ result in specific kernel responses $c_i(t)$,
|
|
Eq.\,\ref{eq:conv}, which are then transformed further into binary responses
|
|
$b_i(t)$, Eq.\,\ref{eq:binary}, by thresholding nonlinearity $\nl$. The
|
|
threshold value $\thr$ is specific to each $k_i(t)$. Across all analyses,
|
|
$\thr$ has been specified as a multiple of the pure-noise reference standard
|
|
deviation $\sigma_{c_i}$ for input $x(t)=\noc(t)$. This ensures that $\thr$ as
|
|
well as the resulting $b_i(t)$ and $f_i(t)$ are comparable across different
|
|
$k_i(t)$ because each pure-noise $c_i(t)$ approximately follows a normal
|
|
distribution around zero~(see appendix
|
|
Figs.\,\ref{fig:app_thresh-lp_kern-sd}--\ref{fig:app_field_kern-sd}).
|
|
|
|
\newpage
|
|
\section{Results}
|
|
|
|
\subsection{Mechanisms driving the emergence of intensity invariance}
|
|
|
|
It is not necessary to test each processing step along the model pathway for
|
|
intensity invariance. Instead, we can focus on those steps that involve
|
|
nonlinear transformations, since these are the only steps that can potentially
|
|
change the dependency on scale $\sca$ between the input and output
|
|
representations. Overall, there are three nonlinear transformations along the
|
|
model pathway: Full-wave rectification during envelope extraction, logarithmic
|
|
compression, and the thresholding nonlinearity during feature extraction. In
|
|
the following, we analyze the effects of each of these transformations on the
|
|
intensity and SNR of the resulting representations as well as their potential
|
|
contribution to intensity invariance.
|
|
|
|
\subsubsection{Full-wave rectification \& lowpass filtering}
|
|
|
|
The first nonlinear transformation along the model pathway is the full-wave
|
|
rectification of the tympanal signal $\filt(t)$ during the extraction of the
|
|
signal envelope (Eq.\,\ref{eq:env}). Rectification transforms the distribution
|
|
of $\filt(t)$ from an approximately zero-centered distribution with both
|
|
positive and negative values into a strictly non-negative distribution. Signal
|
|
envelope $\env(t)$ is then obtained by lowpass filtering the rectified
|
|
$\filt(t)$. The effects of this transformation pair on SNR and potential
|
|
intensity invariance were analyzed by rescaling and processing the input signal
|
|
$\raw(t)$ and comparing standard deviations between the resulting $\filt(t)$
|
|
and $\env(t)$, once for the noiseless case~(Fig.\,\ref{fig:rect-lp}a) and once
|
|
for the noisy case~(Fig.\,\ref{fig:rect-lp}b). In addition, the cutoff
|
|
frequency $\fc$ of the lowpass filter was varied to investigate the influence
|
|
of different filter bandwidths. In the noiseless case, the standard deviations
|
|
of $\filt(t)$ and $\env(t)$ are each reduced compared to the input $\raw(t)$ by
|
|
a multiplicative factor. These factors are constant across all $\sca$, which
|
|
results in a downward shift of the respective curve on a double-logarithmic
|
|
scale, away from the diagonal~(Fig.\,\ref{fig:rect-lp}c). For $\filt(t)$, the
|
|
reduction is a consequence of the bandpass filtering~(Eq.\,\ref{eq:bandpass})
|
|
of $\raw(t)$. For $\env(t)$, the standard deviation is further reduced compared
|
|
to $\filt(t)$. Rectification contributes much less to this reduction than
|
|
lowpass filtering. The degree of reduction by lowpass filtering depends on the
|
|
cutoff frequency $\fc$, with lower $\fc$ (narrow bandwidth) resulting in a
|
|
stronger reduction. In the noisy case, the standard deviations of $\filt(t)$
|
|
and $\env(t)$ can be related to the respective pure-noise reference standard
|
|
deviation~(Fig.\,\ref{fig:rect-lp}d). This causes each curve to start with a
|
|
constant regime of SNR values near 1 for smaller $\sca$, which reflects the
|
|
dominance of the noise component $\noc(t)$ over the song component $\soc(t)$ in
|
|
the input $\raw(t)$. For larger $\sca$, all curves transition into a regime of
|
|
linearly increasing SNR on a double-logarithmic scale. For $\filt(t)$, the
|
|
linear part of the curve deviates only slightly from the diagonal. For
|
|
$\env(t)$, however, the transition occurs at lower $\sca$ compared to
|
|
$\filt(t)$, and the linear part of the curve is shifted leftward away from the
|
|
diagonal, which means that higher SNR values are achieved for the same $\sca$.
|
|
This effect is more pronounced for lower $\fc$ of the lowpass filter and is
|
|
presumably caused by the attenuation of high-frequency components in the
|
|
signal, which are more prominent in the noise component $\noc(t)$ than in the
|
|
song component $\soc(t)$. The effect also appears relatively consistent across
|
|
different species, although small variations exist~(Fig.\,\ref{fig:rect-lp}e
|
|
and appendix Fig.\,\ref{fig:app_rect-lp}). In summary, the standard deviation
|
|
of $\env(t)$ has never been observed to saturate for larger $\sca$ but rather
|
|
continues to increase proportionally to $\sca$ for all tested $\fc$, in both
|
|
the noiseless and the noisy case and across different species. Consequently,
|
|
the combination of rectification and lowpass filtering does not contribute to
|
|
intensity invariance. However, this transformation pair does improve the SNR of
|
|
$\env(t)$ relative to $\filt(t)$ and thus provides subsequent processing stages
|
|
with a more robust input representation and higher input SNR.
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_rect_lp.pdf}
|
|
\caption{\textbf{Rectification and lowpass filtering improves SNR
|
|
but does not contribute to intensity invariance.}
|
|
Input $\raw(t)$ consists of $\soc(t)$ scaled by $\sca$ with
|
|
optional $\noc(t)$ and is successively transformed into
|
|
tympanal signal $\filt(t)$ and envelope $\env(t)$.
|
|
\textbf{Top}:~Examples of $\filt(t)$ and $\env(t)$ for
|
|
different $\sca$.
|
|
\textbf{a}:~Noiseless case.
|
|
\textbf{b}:~Noisy case.
|
|
\textbf{Bottom}:~Intensity measures over $\sca$. Different
|
|
line styles indicate different cutoff frequencies $\fc$ of the
|
|
lowpass filter extracting $\env(t)$.
|
|
\textbf{c}:~Noiseless case: Standard deviation $\sigma_x$ of
|
|
$\filt(t)$ and $\env(t)$, respectively.
|
|
\textbf{d}:~Noisy case: Ratio of $\sigma_x$ to the respective
|
|
pure-noise reference $\sigma_{\eta}$ for $\sca=0$.
|
|
\textbf{e}:~Ratio of $\sigma_x$ to $\sigma_{\eta}$ of
|
|
$\env(t)$ as in \textbf{d} for different species (averaged
|
|
over songs and recordings, appendix
|
|
Fig.\,\ref{fig:app_rect-lp}).
|
|
}
|
|
\label{fig:rect-lp}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\subsubsection{Logarithmic compression \& spike-frequency adaptation}
|
|
|
|
The second nonlinear transformation along the model pathway is the logarithmic
|
|
compression of the signal envelope $\env(t)$ into $\db(t)$, Eq.\,\ref{eq:log},
|
|
which is then followed by the highpass filtering of $\db(t)$,
|
|
Eq.\,\ref{eq:highpass}, to obtain the intensity-adapted envelope $\adapt(t)$.
|
|
The interplay of this transformation pair was analyzed by rescaling and
|
|
processing the input signal $\filt(t)$ and comparing standard deviations
|
|
between the resulting $\env(t)$, $\db(t)$, and $\adapt(t)$. It is necessary to
|
|
use $\filt(t)$ as input for this analysis instead of $\env(t)$, because
|
|
$\env(t)$ results from a nonlinear transformation and hence cannot be
|
|
synthesized as an additive mixture of song component $\soc(t)$ and noise
|
|
component $\noc(t)$. % <-- Sentence may be methods section material.
|
|
However, it is much easier to conceive a mathematical description of the
|
|
effects of logarithmic compression and adaptation if $\env(t)$ itself is
|
|
assumed to be composed of $\soc(t)$ and $\noc(t)$. In the noiseless
|
|
case~(Fig.\,\ref{fig:log-hp}a), $\env(t)$ takes the form of
|
|
\begin{equation}
|
|
\env(t)\,=\,\sca\,\cdot\,\soc(t), \qquad \env(t)\,>\,0\enspace\forall\enspace t\,\in\,\mathbb{R}
|
|
\label{eq:toy_env_pure}
|
|
\end{equation}
|
|
The standard deviation of $\env(t)$ increases linearly with $\sca$ on a
|
|
double-logarithmic scale and is slightly reduced~(Fig.\,\ref{fig:log-hp}c)
|
|
compared to the input $\filt(t)$, which is consistent with the results of the
|
|
previous analysis~(Fig.\,\ref{fig:rect-lp}c). By conversion of $\env(t)$ to
|
|
decibel scale, $\sca$ turns from a multiplicative scale in linear space into an
|
|
additive term, or offset, in logarithmic space:
|
|
\begin{equation}
|
|
\db(t)\,=\,20\,\cdot\,\dec \left[\,\sca\,\cdot\,s(t)\,\right]\,=\,20\,\cdot\,\left[\dec \sca\,+\,\dec s(t)\right], \qquad \sca\,>\,0
|
|
\label{eq:toy_log_pure}
|
|
\end{equation}
|
|
The highpass filtering of $\db(t)$ can be approximated as a subtraction of the
|
|
local signal offset within a suitable time interval $0 \ll \thp <
|
|
\frac{1}{\fc}$:
|
|
\begin{equation}
|
|
\begin{split}
|
|
\adapt(t)\,\approx\,\db(t)\,-\,20\,\cdot\,\dec \sca\,=\,20\,\cdot\,\dec s(t)
|
|
\end{split}
|
|
\label{eq:toy_highpass_pure}
|
|
\end{equation}
|
|
This eliminates $\sca$ from $\adapt(t)$ and thus renders it perfectly
|
|
intensity-invariant, with a constant standard deviation of around 10\,dB across
|
|
all $\sca>0$~(Fig.\,\ref{fig:log-hp}c). In contrast, in the noisy
|
|
case~(Fig.\,\ref{fig:log-hp}b), $\env(t)$ takes the form of
|
|
\begin{equation}
|
|
\env(t)\,=\,\sca\,\cdot\,\soc(t)\,+\,\noc(t), \qquad \env(t)\,>\,0\enspace\forall\enspace t\,\in\,\mathbb{R}
|
|
\label{eq:toy_env_noise}
|
|
\end{equation}
|
|
Similar to the previous analysis~(Fig.\,\ref{fig:rect-lp}d), the ratio of the
|
|
standard deviation of $\env(t)$ to its pure-noise reference standard deviation
|
|
on a double-logarithmic scale follows a constant regime for small $\sca$ and a
|
|
linearly increasing regime for larger $\sca$~(Fig.\,\ref{fig:log-hp}d). Decibel
|
|
conversion of $\env(t)$
|
|
% \begin{equation}
|
|
% \begin{split}
|
|
% \db(t)\,&=\,20\,\cdot\,\dec \left[\,\sca\,\cdot\,s(t)\,+\,\eta(t)\,\right]\\
|
|
% &=\,20\,\cdot\,\left(\dec \sca\,+\,\dec \left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]\right), \qquad \sca\,>\,0
|
|
% \end{split}
|
|
% \label{eq:toy_log_noise}
|
|
% \end{equation}
|
|
\begin{equation}
|
|
\db(t)\,=\,20\,\cdot\,\left(\dec \sca\,+\,\dec \left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]\right), \qquad \sca\,>\,0
|
|
\label{eq:toy_log_noise}
|
|
\end{equation}
|
|
allows for the separation of $\sca$ from $\soc(t)$ but introduces a scaling of
|
|
$\noc(t)$ by the inverse of $\sca$, which remains present even after the offset
|
|
subtraction:
|
|
\begin{equation}
|
|
\begin{split}
|
|
\adapt(t)\,\approx\,20\,\cdot\,\dec\left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]
|
|
\end{split}
|
|
\label{eq:toy_highpass_noise}
|
|
\end{equation}
|
|
% \begin{equation}
|
|
% \begin{split}
|
|
% \adapt(t)\,\approx\,\db(t)\,-\,20\,\cdot\,\dec \sca\,=\,20\,\cdot\,\dec\left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]
|
|
% \end{split}
|
|
% \label{eq:toy_highpass_noise}
|
|
% \end{equation}
|
|
This means that, in the noisy case, $\sca$ cannot be entirely eliminated from
|
|
$\adapt(t)$, only redistributed between $\soc(t)$ and $\noc(t)$. If $\sca$ is
|
|
sufficiently large ($\sca\gg1$, saturation regime), $\noc(t)$ is attenuated to
|
|
the point of being negligible, so that $\adapt(t)$ is a scale-free
|
|
representation of $\soc(t)$. If $\sca$ and $\noc(t)$ are at similar scales
|
|
($\sca\approx1$, transient regime), $\adapt(t)$ largely resembles $\db(t)$.
|
|
Finally, if $\sca$ is sufficiently small ($0<\sca\ll1$, noise regime),
|
|
$\noc(t)$ masks $\soc(t)$ even after the intensity adaptation. Accordingly, the
|
|
effective intensity invariance of $\adapt(t)$ through logarithmic compression
|
|
and adaptation is limited by the SNR of $\env(t)$: Songs that have already
|
|
sunk into the noise floor at the level of $\env(t)$ cannot be recovered by
|
|
subsequent processing steps. The general pattern of noise regime, transient
|
|
regime, and saturation regime remains consistent across different
|
|
species~(Fig.\,\ref{fig:log-hp}e). However, the saturation point --- the $\sca$
|
|
value at which the SNR of $\adapt(t)$ starts to saturate --- and the saturation
|
|
level --- the constant SNR of $\adapt(t)$ within the saturation regime --- vary
|
|
considerably between and within species~(appendix
|
|
Figs.\,\ref{fig:app_log-hp_curves} and~\ref{fig:app_log-hp_saturation}). For
|
|
example, \textit{C. biguttulus} and \textit{C. mollis} display a noticeably
|
|
lower saturation level compared to other species. These differences are not to
|
|
be underestimated, since the saturation level of $\adapt(t)$ determines the
|
|
maximum input SNR for subsequent processing steps. In other words, the fact
|
|
that $\adapt(t)$ eventually reaches a saturation regime is, of course,
|
|
desirable in the context of intensity invariance, but it also means passing up
|
|
on the higher SNR values that are achieved by $\env(t)$ for the same $\sca$ (up
|
|
to several orders of magnitude, Fig.\,\ref{fig:log-hp}d). This trade-off
|
|
between intensity invariance and SNR is a recurring phenomenon that is further
|
|
addressed in the following sections.
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_log_hp.pdf}
|
|
\caption{\textbf{Intensity invariance through logarithmic compression and
|
|
adaptation is restricted by the noise floor and decreases
|
|
SNR.}
|
|
Input $\filt(t)$ consists of $\soc(t)$
|
|
scaled by $\sca$ with optional $\noc(t)$
|
|
and is successively transformed into envelope $\env(t)$,
|
|
logarithmically compressed envelope $\db(t)$, and
|
|
intensity-adapted envelope $\adapt(t)$.
|
|
\textbf{Top}:~Examples of $\env(t)$, $\db(t)$, and
|
|
$\adapt(t)$ for different $\sca$.
|
|
\textbf{a}:~Noiseless case.
|
|
\textbf{b}:~Noisy case.
|
|
\textbf{Bottom}:~Intensity measures over $\sca$.
|
|
\textbf{c}:~Noiseless case: Standard deviation $\sigma_x$
|
|
of $\env(t)$, $\db(t)$, and $\adapt(t)$, respectively.
|
|
\textbf{d}:~Noisy case: Ratio of $\sigma_x$ to the
|
|
respective pure-noise reference $\sigma_{\eta}$ for
|
|
$\sca=0$. Shaded areas indicate $5\,\%$ (dark grey) and
|
|
$95\,\%$ (light grey) curve span for $\adapt(t)$.
|
|
\textbf{e}:~Ratio of $\sigma_x$ to $\sigma_{\eta}$ of
|
|
$\adapt(t)$ as in \textbf{d} for different species
|
|
(averaged over songs and recordings, appendix
|
|
Fig.\,\ref{fig:app_log-hp_curves}). Dots indicate $95\,\%$
|
|
curve span per species.
|
|
}
|
|
\label{fig:log-hp}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\subsubsection{Thresholding nonlinearity \& temporal averaging}
|
|
|
|
The third nonlinear transformation along the model pathway is the thresholding
|
|
nonlinearity $\nl$ that transforms each kernel response $c_i(t)$ into a binary
|
|
response $b_i(t)$, Eq.\,\ref{eq:binary}. This transformation takes place
|
|
after the convolutional filtering of $\adapt(t)$ with kernel $k_i(t)$,
|
|
Eq.\,\ref{eq:conv}, and is followed by the temporal averaging of $b_i(t)$ into
|
|
the feature set $f_i(t)$ by a lowpass filter, Eq.\,\ref{eq:lowpass}. The
|
|
effects of thresholding and temporal averaging are best illustrated based on a
|
|
single kernel~(Fig.\,\ref{fig:thresh-lp_single}) instead of the full set. For
|
|
this analysis, input $\adapt(t)$ was
|
|
rescaled~(Fig.\,\ref{fig:thresh-lp_single}a) and convolved with kernel $k(t)$.
|
|
The resulting kernel response $c(t)$ was passed through $H(c\,-\,\Theta)$ with
|
|
three different threshold values
|
|
$\Theta$~(Fig.\,\ref{fig:thresh-lp_single}b-d). Each resulting binary response
|
|
$b(t)$ was transformed into $f(t)$, whose average feature value $\mu_f$ serves
|
|
as a measure of intensity~(Fig.\,\ref{fig:thresh-lp_single}ef). The
|
|
thresholding nonlinearity $H(c\,-\,\Theta)$ categorizes the values of $c(t)$
|
|
into ``relevant'' ($c(t)>\Theta$, $b(t)=1$) and ``irrelevant'' ($c(t)\leq\Theta$,
|
|
$b(t)=0$) response values. It thereby splits the probability density $\pc$ of
|
|
$c(t)$ within some observed time interval $T$ into two complementary parts
|
|
around $\Theta$:
|
|
\begin{equation}
|
|
\int_{\Theta}^{+\infty} \pc\,dc\,=\,1\,-\,\int_{-\infty}^{\Theta} \pc\,dc\,=\,\frac{T_1}{T}, \qquad \infint \pc\,dc\,=\,1
|
|
\label{eq:pdf_split}
|
|
\end{equation}
|
|
The right-sided part of the split $\pc$ corresponds to time $T_1$ where
|
|
$c(t)>\Theta$, while the left-sided part corresponds to time $T_0=T-T_1$ where
|
|
$c(t)\leq\Theta$. The improper integral over the right-sided part of $\pc$
|
|
represents the ratio of time $T_1$ to total time $T$ because the total
|
|
integral of a probability density is normalized to 1. The lowpass filtering of
|
|
$b(t)$ can be approximated as temporal averaging over a suitable time interval
|
|
$\tlp>\frac{1}{\fc}$ in order to express $f(t)$ as a similar temporal ratio
|
|
\begin{equation}
|
|
f(t)\,\approx\,\frac{1}{\tlp} \int_{t}^{t\,+\,\tlp} b(\tau)\,d\tau\,=\,\frac{T_1}{\tlp}, \qquad b(t)\,\in\,\{0,\,1\}
|
|
\label{eq:feat_avg}
|
|
\end{equation}
|
|
of time $T_1$ during which $b(t)$ is 1 within the averaging interval $\tlp$.
|
|
Therefore, the value of $f(t)$ at every time point $t$ approximately signifies
|
|
the cumulative probability that $c(t)$ exceeds $\Theta$ during the
|
|
corresponding averaging interval $\tlp$:
|
|
\begin{equation}
|
|
f(t)\,\approx\,\int_{\Theta}^{+\infty} \pclp\,dc\,=\,P(c\,>\,\Theta,\,\tlp)
|
|
\label{eq:feat_prop}
|
|
\end{equation}
|
|
In a sense, $f(t)$ can be interpreted as some sort of duty cycle with respect
|
|
to $\Theta$. For example, a feature value of $f(t)=0.4$ means that $c(t)$
|
|
exceeds $\Theta$ for approximately 40\,\% of the time within $\tlp$ around $t$.
|
|
In the most extreme cases, $\Theta$ lies either above the maximum of $c(t)$ or
|
|
below the minimum of $c(t)$, which results in a minimum or maximum possible
|
|
feature value of $f(t)=0$~(Fig.\,\ref{fig:thresh-lp_single}d, left column) or
|
|
$f(t)=1$, respectively.
|
|
|
|
Importantly, $f(t)$ neither retains information about the timing of individual
|
|
threshold crossings nor the precise values of $c(t)$ apart from their relation
|
|
to $\Theta$. Accordingly, for a given $\Theta$, different $\sca$ can still
|
|
result in similar $T_1$ segments (and hence similar feature values) depending
|
|
on the magnitude of the derivative of $c(t)$ in temporal proximity to time
|
|
points at which $c(t)$ crosses $\Theta$: The steeper the slope of $c(t)$, the
|
|
less $T_1$ changes with variations in $\sca$. The most reliable way of
|
|
exploiting this invariant property of $f(t)$ is to set $\Theta$ to a value near
|
|
0, because these values are least affected by different scales of $c(t)$. For
|
|
sufficiently large $\sca$, $f(t)$ then approaches the same constant $\mu_f$ in
|
|
both the noiseless and the noisy case~(Fig.\,\ref{fig:thresh-lp_single}e,
|
|
saturation regime).
|
|
|
|
The saturation level of $f(t)$ is independent of the precise value of $\Theta$,
|
|
but the saturation point decreases with
|
|
$\Theta$~(Fig.\,\ref{fig:thresh-lp_single}e). Therefore, a threshold value of
|
|
$\Theta=0$ would be the optimal choice for achieving intensity invariance at
|
|
the lowest possible $\sca$. In stark contrast, the closer $\Theta$ is to 0, the
|
|
higher $\mu_f$ in response to the pure noise component $\noc(t)$ and the lower
|
|
the resulting SNR of $f(t)$ between noise regime and saturation
|
|
regime~(Fig.\,\ref{fig:thresh-lp_single}b-d, left column, and
|
|
Fig.\,\ref{fig:thresh-lp_single}e). This trade-off between intensity invariance
|
|
and SNR has already been observed during the previous analysis on logarithmic
|
|
compression and adaptation~(Fig.\,\ref{fig:log-hp}d).
|
|
|
|
Finally, the effects of thresholding and temporal averaging must be seen in the
|
|
context of the previous transformation pair of logarithmic compression and
|
|
adaptation: In the current analysis, the input $\adapt(t)$ can be rescaled by
|
|
arbitrarily large $\sca$, while in the full pathway, the current input
|
|
$\adapt(t)$ is the output $\adapt(t)$ of the previous transformation pair and
|
|
is hence capped to a maximum standard deviation of around
|
|
10\,dB~(Fig.\,\ref{fig:log-hp}cd). This can be illustrated by plotting $\mu_f$
|
|
not over $\sca$~(Fig.\,\ref{fig:thresh-lp_single}e) but over the standard
|
|
deviation of input $\adapt(t)$ instead~(Fig.\,\ref{fig:thresh-lp_single}f). It
|
|
becomes apparent that $\mu_f$ saturates only for standard deviations of
|
|
$\adapt(t)$ that would already be capped. Accordingly, $f(t)$ never reaches the
|
|
saturation regime as determined by the current transformation pair but rather
|
|
adheres to the saturation regime determined by the previous transformation
|
|
pair. In this case, the saturated $\mu_f$ is not independent of $\Theta$
|
|
anymore. The consequences of this interaction between the two mechanisms of
|
|
intensity invariance are further explored in a later section.
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh_lp_single.pdf}
|
|
\caption{\textbf{Intensity invariance through thresholding and temporal
|
|
averaging is mediated by the interaction of threshold
|
|
value and noise floor.}
|
|
Input $\adapt(t)$ consists of $\soc(t)$ scaled by $\sca$ with
|
|
optional $\noc(t)$ and is transformed into single kernel
|
|
response $c(t)$, binary response $b(t)$, and feature
|
|
$f(t)$. Different color shades indicate different
|
|
threshold values $\Theta$ (multiples of pure-noise
|
|
standard deviation $\sigma_{\eta}$ of $c(t)$ for $\sca=0$,
|
|
with darker colors for higher $\Theta$. See also appendix
|
|
Fig.\,\ref{fig:app_thresh-lp_kern-sd}).
|
|
\textbf{Left}:~Noisy case: Examples of $\adapt(t)$ as well
|
|
as $c(t)$, $b(t)$, and $f(t)$ for different $\sca$.
|
|
\textbf{a}:~$\adapt(t)$ with kernel $k(t)$ in black.
|
|
\textbf{b\,-\,d}: $c(t)$, $b(t)$, and $f(t)$ based on the
|
|
same $\adapt(t)$ from \textbf{a} but for different
|
|
$\Theta$.
|
|
\textbf{Right}:~Average value $\mu_f$ of $f(t)$ for each
|
|
$\Theta$ from \textbf{b\,-\,d}. Dots indicate $95\,\%$
|
|
curve span (noisy case).
|
|
\textbf{e}:~$\mu_f$ over $\sca$, once for the noisy case
|
|
(solid lines) and once for the noiseless case (dotted
|
|
lines).
|
|
\textbf{f}:~Noisy case: $\mu_f$ over standard deviation
|
|
$\sigma_{\text{adapt}}$ of input $\adapt$ corresponding to
|
|
$\sca$ shown in \textbf{e}. Shaded area indicates values
|
|
of $\sigma_{\text{adapt}}$ that are capped in the output
|
|
$\adapt(t)$ of the previous transformation pair
|
|
(Fig.\,\ref{fig:log-hp}cd).
|
|
}
|
|
\label{fig:thresh-lp_single}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\subsection{Intensity invariance of species-specific feature representations}
|
|
|
|
Having established both the meaning of the feature value and the mechanism of
|
|
intensity invariance by thresholding and temporal averaging, the question
|
|
remains how this mechanism acts on a set of features $f_i(t)$ based on
|
|
different species-specific songs~(Fig.\,\ref{fig:thresh-lp_species}a). The
|
|
previous analysis was repeated with three different kernels $k_i(t)$ using a
|
|
single kernel-specific threshold value $\thr$; and the resulting average
|
|
feature values $\muf$ were plotted over
|
|
$\sca$~(Fig.\,\ref{fig:thresh-lp_species}bc). Additionally, 2D feature spaces
|
|
spanned by each pair of $f_i(t)$ were plotted to investigate the separability
|
|
of species-specific songs based on the feature representation as a function of
|
|
$\sca$~(Fig.\,\ref{fig:thresh-lp_species}de). Each species-specific combination
|
|
of $\muf$ follows a trajectory through feature space that develops with $\sca$.
|
|
These trajectories correspond to the transient regime between the constant
|
|
(noise) regime and the saturation regime, which are only visible as the start
|
|
and end points of the trajectories, respectively. The horizontal dashes in the
|
|
colorbars indicate the range of $\sca$ that corresponds to the transient regime
|
|
across $f_i(t)$ for each species.
|
|
|
|
In the noiseless case, each $\muf$ is 0 for small $\sca$ across all
|
|
species~(Fig.\,\ref{fig:thresh-lp_species}b) because $c_i(t)$ never exceeds
|
|
$\thr$. Accordingly, each trajectory starts at the origin of the feature
|
|
space~(Fig.\,\ref{fig:thresh-lp_species}d). For larger $\sca$, all $\muf$
|
|
saturate at individual values whose combination differs between species, so
|
|
that the songs of each species are eventually represented by distinct points in
|
|
feature space. However, the species-specific trajectories cross each other at
|
|
numerous points, which means that the songs of two species --- each at a
|
|
specific $\sca$ --- can result in the same combination of $\muf$. Furthermore,
|
|
the specific saturation point of $f_i(t)$ depends on the species: For
|
|
\textit{C. mollis}, all $\muf$ saturate around the same $\sca$, while
|
|
\textit{O. rufipes} exhibits considerable variation between the three $f_i(t)$.
|
|
The larger the variation in saturation points between $f_i(t)$, the stronger
|
|
the curvature of the trajectory through feature space.
|
|
|
|
In the noisy case, $\muf$ is non-zero even for the smallest
|
|
$\sca$~(Fig.\,\ref{fig:thresh-lp_species}c) because the addition of the noise
|
|
component $\noc(t)$ to input $\adapt(t)$ drives $c_i(t)$ above $\thr$
|
|
regardless of the song component $\soc(t)$. The starting value of $\muf$ is the
|
|
same across all $f_i(t)$ and species by construction of the specific $\thr$. In
|
|
consequence, the trajectories through feature space do not start at the origin
|
|
but rather at approximately the same point along the
|
|
diagonal~(Fig.\,\ref{fig:thresh-lp_species}e). For larger $\sca$, all $\muf$
|
|
saturate at the same values as in the noiseless case, as expected from the
|
|
previous analysis~(Fig.\,\ref{fig:thresh-lp_single}e). However, the
|
|
trajectories now move a much shorter distance through feature space for a
|
|
similar range of $\sca$ due to the lower SNR of $f_i(t)$ between noise regime
|
|
and saturation regime, which increases the likelihood of trajectories crossing
|
|
each other. Finally, the saturation points of $f_i(t)$ for a given species are
|
|
slightly higher in the noisy case, but the variation between $f_i(t)$ remains
|
|
largely unchanged.
|
|
|
|
In summary, even a comparatively small set of three features $f_i(t)$ can, in
|
|
principle, represent different species-specific songs at distinct points in
|
|
feature space, regardless of the presence of noise. However, this only holds
|
|
for sufficiently large $\sca$ that allow $f_i(t)$ to reach a saturation regime.
|
|
During the transient regime, the species-specific combination of $\muf$ can
|
|
very well be the same for two or more different species at specific $\sca$,
|
|
although this may be alleviated by the inclusion of additional $f_i(t)$.
|
|
Overall, the results of this analysis suggest that $\thr$ should rather be
|
|
chosen in favor of a higher SNR ($\thr$ just above pure-noise $c_i(t)$) than a
|
|
lower saturation point ($\thr\to0$). First, because this reduces the density of
|
|
trajectories through feature space, and second, because the capping of
|
|
$\adapt(t)$ by the previous transformation pair likely renders the saturation
|
|
point of $f_i(t)$ less relevant.
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh_lp_species.pdf}
|
|
\caption{\textbf{Feature representation of different species-specific songs
|
|
saturates at different points in feature space.}
|
|
Same input and processing as in
|
|
Fig.\,\ref{fig:thresh-lp_single} but with three different
|
|
kernels $k_i$ and a single kernel-specific threshold value
|
|
$\thr=0.5\cdot\sigma_{\eta_i}$ (appendix
|
|
Fig.\,\ref{fig:app_thresh-lp_kern-sd}).
|
|
\textbf{a}:~Examples of species-specific grasshopper
|
|
songs.
|
|
\textbf{Middle}:~Average value $\muf$ of each feature
|
|
$f_i(t)$ over $\sca$ per species (averaged over songs and
|
|
recordings, appendix Figs.\,\ref{fig:app_thresh-lp_pure}
|
|
and \ref{fig:app_thresh-lp_noise}). Different color shades
|
|
indicate different $k_i$. Dots indicate $95\,\%$ curve
|
|
span per $k_i$.
|
|
\textbf{b}:~Noiseless case.
|
|
\textbf{c}:~Noisy case.
|
|
\textbf{Bottom}:~2D feature spaces spanned by each pair of
|
|
$f_i(t)$. Each trajectory corresponds to a
|
|
species-specific combination of $\muf$ that develops
|
|
with $\sca$ (colorbars). Horizontal dashes in the colorbar
|
|
indicate $5\,\%$ (dark grey) and $95\,\%$ (light grey)
|
|
curve span of the norm across all three $\muf$ per
|
|
species.
|
|
\textbf{d}:~Noiseless case.
|
|
\textbf{e}:~Noisy case. Shaded areas indicate the average
|
|
minimum $\muf$ across all species-specific trajectories.
|
|
}
|
|
\label{fig:thresh-lp_species}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\subsection{Intensity invariance along the full model pathway}
|
|
|
|
Through the previous analyses, we could establish two mechanisms of intensity
|
|
invariance: Logarithmic compression and adaptation as well as thresholding and
|
|
temporal averaging. While each transformation pair by itself can provide some
|
|
level of invariance, certain results suggest that the first mechanism may
|
|
actually limit or even nullify the effect of the second mechanism. In the
|
|
following sections, we investigate the combined effect of both mechanisms along
|
|
the full model pathway~(Fig.\,\ref{fig:pipeline_full}) and explore the
|
|
consequences of disabling the first mechanism by skipping the logarithmic
|
|
compression step~(Fig.\,\ref{fig:pipeline_short}).
|
|
|
|
\subsubsection{Including logarithmic compression}
|
|
|
|
For this analysis, input $\raw(t)$ --- including both song component $\soc(t)$
|
|
and noise component $\noc(t)$ --- was rescaled and processed throughout all
|
|
steps of the model pathway~(Fig.\,\ref{fig:pipeline_full}a) up to the feature
|
|
set $f_i(t)$. As before, the standard deviation was used as intensity measure
|
|
for each resulting representation except $b_i(t)$ and $f_i(t)$. For $f_i(t)$,
|
|
the average feature value $\muf$ was used, while $b_i(t)$ was omitted from the
|
|
analysis. Plotting each intensity measure over
|
|
$\sca$~(Fig.\,\ref{fig:pipeline_full}b) reinforces many of the previous
|
|
observations. For ease of visualization, the kernel-specific curves for
|
|
$c_i(t)$ and $f_i(t)$ were summarized by their median. Representations prior to
|
|
logarithmic compression --- $\filt(t)$ and $\env(t)$ --- show a linear increase
|
|
of the intensity measure for larger $\sca$ on a double-logarithmic scale.
|
|
Representations after logarithmic compression --- $\db(t)$, $\adapt(t)$, and
|
|
$c_i(t)$ --- are the first to reach a saturation regime and do so at
|
|
approximately the same $\sca$ because they are separated only by linear
|
|
transformations. Feature set $f_i(t)$ reaches a saturation regime, as well. But
|
|
contrary to previous results, the saturation point of $f_i(t)$ appears below
|
|
that of $c_i(t)$, which suggests that the second mechanism of thresholding and
|
|
temporal averaging can indeed improve intensity invariance beyond the first
|
|
mechanism of logarithmic compression and adaptation. The difference in
|
|
saturation points is best illustrated based on the ratio of each intensity
|
|
measure to the respective pure-noise reference
|
|
value~(Fig.\,\ref{fig:pipeline_full}d). However, compressing $f_i(t)$ into a
|
|
median across $k_i(t)$ conceals many kernel-specific details. It is therefore
|
|
necessary to consider the development of each $f_i(t)$ over $\sca$
|
|
separately~(Fig.\,\ref{fig:pipeline_full}c). Indeed, all 40 $f_i(t)$ in the set
|
|
reach a saturation regime for sufficiently large $\sca$. The saturated $\muf$
|
|
are distributed over a range of values --- which is the prerequisite for
|
|
forming species-specific combinations --- but are limited to a rather small
|
|
subset of possible values between 0 and 1. Based on previous
|
|
results~(Fig.\,\ref{fig:thresh-lp_single}f), this is likely due to the capping
|
|
of $\adapt(t)$ that prevents $f_i(t)$ from reaching its intrinsic saturation
|
|
value; but this cannot be confirmed until the following
|
|
analysis~(Fig.\,\ref{fig:pipeline_short}). Looking at the kernel-specific SNR
|
|
values of $c_i(t)$ over $\sca$~(Fig.\,\ref{fig:pipeline_full}e) and $f_i(t)$
|
|
over $\sca$~(Fig.\,\ref{fig:pipeline_full}f) reveals a high degree of variation
|
|
between different $k_i(t)$. Certain $f_i(t)$ achieve much higher SNR values
|
|
than $c_i(t)$ for the same $\sca$ due to the former's capacity for arbitrarily
|
|
low pure-noise responses ($\muf\to0$) and hence arbitrarily high SNR values.
|
|
Finally, the question remains whether the suspected improvement of intensity
|
|
invariance by $f_i(t)$ beyond $c_i(t)$ holds at the level of individual
|
|
$k_i(t)$. The single saturation points based on the median across $k_i(t)$ for
|
|
$c_i(t)$ and $f_i(t)$ are expanded into distributions of kernel-specific
|
|
saturation points~(Fig.\,\ref{fig:pipeline_full}g). For $c_i(t)$, the
|
|
distribution is rather narrow and corresponds well to the single saturation
|
|
point based on the median. For $f_i(t)$, however, the distribution is much
|
|
broader and is not centered around the single saturation point based on the
|
|
median but rather shifted towards lower $\sca$. Care must be taken when
|
|
interpreting the height of either distribution due to the logarithmic scaling
|
|
of the underlying $\sca$ axis. Nevertheless, the overall pattern suggests that
|
|
the saturation points of specific $f_i(t)$ are indeed lower than those of their
|
|
$c_i(t)$ counterparts. Therefore, the effect of thresholding and temporal
|
|
averaging on intensity invariance is not necessarily nullified by the previous
|
|
logarithmic compression and adaptation.
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_full_Omocestus_rufipes.pdf}
|
|
\caption{\textbf{Step-wise emergence of intensity-invariant song
|
|
representations along the model pathway.}
|
|
Input $\raw(t)$ consists of $\soc(t)$ scaled by $\sca$
|
|
with added $\noc(t)$ and is processed up to the feature
|
|
set $f_i(t)$ using kernel-specific threshold values
|
|
$\thr=2\cdot\sigma_{\eta_i}$ (appendix
|
|
Fig.\,\ref{fig:app_full_kern-sd}). Different color shades
|
|
indicate different types of Gabor kernels with specific
|
|
lobe number $\kn$ and either $+$ or $-$ sign, sorted (dark
|
|
to light) first by increasing $\kn$ and then by
|
|
sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for
|
|
each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and
|
|
$16\,$ms per type; 8 types, 40 $k_i(t)$ in total).
|
|
\textbf{a}:~Examples of $\filt(t)$, $\env(t)$, $\db(t)$,
|
|
$\adapt(t)$, $c_i(t)$, and $f_i(t)$ for different $\sca$.
|
|
\textbf{b}:~Intensity measures over $\sca$. The median
|
|
over $k_i(t)$ is shown for $c_i(t)$ and $f_i(t)$. Dots
|
|
indicate $95\,\%$ curve span for $\db(t)$, $\adapt(t)$,
|
|
$c_i(t)$, and $f_i(t)$.
|
|
\textbf{c}:~Average value $\muf$ of each $f_i(t)$
|
|
over $\sca$.
|
|
\textbf{d}:~Ratio of intensity measures from \textbf{b} to
|
|
the respective pure-noise reference for $\sca=0$.
|
|
\textbf{e}:~Ratio of standard deviation $\sigma_{c_i}$ of
|
|
each $c_i(t)$.
|
|
\textbf{f}:~Ratio of $\muf$.
|
|
\textbf{g}:~Distributions of kernel-specific $\sca$ that
|
|
correspond to $95\,\%$ curve span for $c_i(t)$ and
|
|
$f_i(t)$. Dots indicate values based on the median from
|
|
\textbf{b}.
|
|
}
|
|
\label{fig:pipeline_full}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\subsubsection{Excluding logarithmic compression}
|
|
|
|
The previous analysis was repeated in exactly the same way as before, except
|
|
that the logarithmic compression of $\env(t)$, Eq.\,\ref{eq:log}, was skipped
|
|
in order to disable the first mechanism of intensity invariance. Consequently,
|
|
$\adapt(t)$ is merely a highpass filtered version of $\env(t)$; and $\db(t)$ is
|
|
missing entirely~(Fig.\,\ref{fig:pipeline_short}a). As expected, all
|
|
representations prior to the thresholding nonlinearity $\nl$ --- $\filt(t)$,
|
|
$\env(t)$, $\adapt(t)$, and $c_i(t)$ --- show a linear increase of the
|
|
intensity measure for larger $\sca$, while $f_i(t)$ is the only representation
|
|
to reach a saturation regime~(Fig.\,\ref{fig:pipeline_short}bd). The
|
|
saturated $\muf$ are distributed over a much broader range of values than in
|
|
the previous analysis~(Fig.\,\ref{fig:pipeline_short}c). Intriguingly, the
|
|
distribution of $\muf$ is symmetric around a value of 0.5. This is relevant
|
|
because every kernel $k^+(t)$ in the underlying kernel set has a counterpart of
|
|
opposite sign that is otherwise identical, so that $k^+(t)=-k^-(t)$. The
|
|
responses of $k^+(t)$ and $k^-(t)$ to the same input $\adapt(t)$ are also
|
|
inverted because convolution is a linear operation: $c^+(t)=-c^-(t)$. The
|
|
distributions of $c^+(t)$ and $c^-(t)$ are hence inverted to each other, as
|
|
well: $p(c^+)=p(-c^-)$. Based on Eq.\,\ref{eq:feat_prop}, transforming $c^+(t)$
|
|
and $c^-(t)$ further using the same $\Theta$ thus results in two complementary
|
|
features $f^+(t)$ and $f^-(t)$ that are symmetric around 0.5, so that
|
|
$f^+(t)=1-f^-(t)$. Of course, this symmetry throughout the feature
|
|
representation goes hand in hand with a substantial degree of redundancy and is
|
|
hardly expected to be present in the actual grasshopper auditory system. But
|
|
the fact that the saturated $\muf$ are distributed symmetrically around 0.5
|
|
provides concrete evidence that each $f_i(t)$ is able to reach its intrinsic
|
|
saturation level in the absence of logarithmic
|
|
compression~(Fig.\,\ref{fig:pipeline_short}c), which is otherwise prevented by
|
|
the capping of $\adapt(t)$, as seen during previous
|
|
analyses~(Fig.\,\ref{fig:thresh-lp_single}f and
|
|
Fig.\,\ref{fig:pipeline_full}c). Otherwise, there appear to be no major
|
|
differences in the development of $f_i(t)$ over $\sca$ compared to the previous
|
|
analysis, neither on the kernel-specific SNR
|
|
values~(Fig.\,\ref{fig:pipeline_short}e) nor on the distribution of
|
|
kernel-specific saturation points~(Fig.\,\ref{fig:pipeline_short}f). Overall,
|
|
the most substantial consequence of skipping the logarithmic compression is
|
|
that it allows $f_i(t)$ to reach its intrinsic saturation value. If this
|
|
results in a wider range of $\muf$ across the feature set, it should be
|
|
beneficial for forming species-specific combinations. However, this depends on
|
|
multiple different factors such as the choice of $k_i(t)$ and $\thr$ as well as
|
|
the structure and distribution of the specific song and is hence not guaranteed
|
|
simply by disabling logarithmic compression.
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_short_Omocestus_rufipes.pdf}
|
|
\caption{\textbf{Effects of disabling logarithmic compression on intensity
|
|
invariance along the model pathway.}
|
|
Same input and processing as in
|
|
Fig.\,\ref{fig:pipeline_full}, using kernel-specific
|
|
threshold values $\thr=2\cdot\sigma_{\eta_i}$ (appendix
|
|
Fig.\,\ref{fig:app_short_kern-sd}), except that
|
|
logarithmic compression and hence $\db(t)$ are skipped.
|
|
\textbf{a}:~Examples of $\filt(t)$, $\env(t)$,
|
|
$\adapt(t)$, $c_i(t)$, and $f_i(t)$ for different $\sca$.
|
|
\textbf{b}:~Intensity measures over $\sca$. The median
|
|
over $k_i(t)$ is shown for $c_i(t)$ and $f_i(t)$. Dot
|
|
indicates $95\,\%$ curve span for $f_i(t)$.
|
|
\textbf{c}:~Average value $\muf$ of each $f_i(t)$
|
|
over $\sca$.
|
|
\textbf{d}:~Ratio of intensity measures from \textbf{b} to
|
|
the respective pure-noise reference for $\sca=0$.
|
|
\textbf{e}:~Ratio of $\muf$.
|
|
\textbf{f}:~Distribution of kernel-specific $\sca$ that
|
|
correspond to $95\,\%$ curve span for $f_i(t)$. Dot
|
|
indicates value based on the median from \textbf{b}.
|
|
}
|
|
\label{fig:pipeline_short}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\subsubsection{Intensity invariance in a naturalistic setting}
|
|
|
|
% This one appears...meh?
|
|
So far, the analyses on intensity invariance were based on synthetically
|
|
generated input signals, since these allow for a systematic manipulation of the
|
|
mixture of song component $\soc(t)$ and noise component $\noc(t)$ over an
|
|
arbitrary range of scales $\sca$. Now, the question remains how the model
|
|
pathway performs under more naturalistic conditions. The previous analysis of
|
|
the full model pathway~(Fig.\,\ref{fig:pipeline_full}) was hence repeated,
|
|
using field recordings of a song of \textit{P. parallelus} as input $\raw(t)$
|
|
and a segment of background noise from the same recordings as pure-noise
|
|
reference. Recordings were taken simultaneously at eight different distances
|
|
$d$ from the sender, ranging from $10\,$cm to $220\,$cm with intervals of
|
|
$30\,$cm between microphones. The precise value of $\sca$ that corresponds to a
|
|
given $d$ cannot be determined in a straightforward manner, but $\sca$ is
|
|
expected to be inversely proportional to $d$ based on the inverse-square law of
|
|
sound propagation. All intensity measures and ratios thereof were hence plotted
|
|
over $1/d$ on a double-logarithmic scale, which is comparable to
|
|
previous analyses insofar as a decade on the $1/d$ axis corresponds to a decade on
|
|
the $\sca$ axis. To complicate matters further, the $1/d$ axis is sampled too
|
|
sparsely to determine saturation points as before based on the $95\,\%$ curve
|
|
span. Instead, one has to rely on the slope of the curve to assess if, and at
|
|
which $1/d$, a given representation reaches a saturation regime. Bearing these
|
|
limitations in mind, the intensity measures of each representation over
|
|
$1/d$~(Fig.\,\ref{fig:pipeline_field}b) follow a pattern that is consistent
|
|
with the results of the previous simulation-based
|
|
analysis~(Fig.\,\ref{fig:pipeline_full}b): The standard deviations of
|
|
$\filt(t)$ and $\env(t)$ increase linearly with $1/d$. The
|
|
standard deviations of $\db(t)$, $\adapt(t)$, and $c_i(t)$ show a weaker
|
|
increase with $1/d$ and appear to approach, but not reach, a saturation regime
|
|
for larger $1/d$. The average feature values $\muf$ of $f_i(t)$ show an even
|
|
weaker increase with $1/d$ and appear to reach a saturation regime for
|
|
$d=40\,$cm and $d=10\,$cm, which is consistent across most $f_i(t)$ in the
|
|
set~(Fig.\,\ref{fig:pipeline_field}c). Saturation of $f_i(t)$ without
|
|
saturation of $c_i(t)$ suggests that the input $\raw(t)$ at the smallest
|
|
$d=10\,$cm corresponds to a value of $\sca$ between 10 and 20 based on
|
|
comparison with the simulation-based analysis~(Fig.\,\ref{fig:pipeline_full}b).
|
|
The saturated $\muf$ are distributed over a comparatively narrow range of values,
|
|
which could in part be a property of the songs of \textit{P. parallelus}~(see
|
|
also Fig.\,\ref{fig:thresh-lp_species}bc). The ratios of each intensity measure
|
|
to the respective pure-noise reference value are not aligned across
|
|
representations~(Fig.\,\ref{fig:pipeline_field}d) or
|
|
kernels~(Fig.\,\ref{fig:pipeline_field}ef) but serve to consolidate the
|
|
previous observation that only $f_i(t)$ exhibits some degree of intensity
|
|
invariance within the available range of $1/d$. Based on the current results,
|
|
this intensity invariance of $f_i(t)$ in the field holds up to a distance of
|
|
around $40\,$cm from the sender, decays steadily between $40\,$cm and
|
|
$130\,$cm, and is substantially diminished for larger
|
|
distances~(Fig.\,\ref{fig:pipeline_field}a, bottom row).
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_field.pdf}
|
|
\caption{\textbf{Intensity invariance along the model pathway in a
|
|
naturalistic setting.}
|
|
Input $\raw(t)$ consists of a song of \textit{P.
|
|
parallelus} recorded in the field at eight different
|
|
distances $d$ and is processed up to the feature set
|
|
$f_i(t)$ using kernel-specific threshold values
|
|
$\thr=2\cdot\sigma_{\eta_i}$ (appendix
|
|
Fig.\,\ref{fig:app_field_kern-sd}). Different color shades
|
|
indicate different types of Gabor kernels with specific
|
|
lobe number $\kn$ and either $+$ or $-$ sign, sorted (dark
|
|
to light) first by increasing $\kn$ and then by
|
|
sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for
|
|
each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and
|
|
$16\,$ms per type; 8 types, 40 $k_i(t)$ in total).
|
|
\textbf{a}:~$\filt(t)$, $\env(t)$, $\db(t)$, $\adapt(t)$,
|
|
$c_i(t)$, and $f_i(t)$ at each $d$. A noise segment from
|
|
the same recording is shown for reference.
|
|
\textbf{b}:~Intensity measures over $d$. The median over
|
|
$k_i(t)$ is shown for $c_i(t)$ and $f_i(t)$.
|
|
\textbf{c}:~Average value $\muf$ of each $f_i(t)$ over
|
|
$d$.
|
|
\textbf{d}:~Ratio of intensity measures from \textbf{b} to
|
|
the respective value obtained from the noise reference.
|
|
\textbf{e}:~Ratio of standard deviation $\sigma_{c_i}$ of
|
|
each $c_i(t)$.
|
|
\textbf{f}:~Ratios of $\muf$.
|
|
}
|
|
\label{fig:pipeline_field}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\subsection{Interspecific and intraspecific feature variability}
|
|
|
|
In the final analysis of the current study, we investigated the variability of
|
|
songs in the feature representation between different species and within the
|
|
same species~(Fig.\,\ref{fig:feat_cross_species}). Naturally, a feature
|
|
representation that is both consistent across different songs of the same
|
|
species and sufficiently different between songs of different species is a
|
|
fundamental prerequisite for species-specific song recognition. The data used
|
|
in this analysis corresponds to the saturated $\muf$ of each $f_i(t)$ from the
|
|
previous analysis of the full model pathway~(Fig.\,\ref{fig:pipeline_full}c),
|
|
using different songs of \textit{O. rufipes} for the intraspecific comparisons
|
|
and single songs from a number of species for the interspecific comparisons
|
|
(also shown in Fig.\,\ref{fig:thresh-lp_species}a). Accordingly, each song is
|
|
represented by 40 values of $\muf$ based on the same set of $f_i(t)$. For each
|
|
comparison, $\muf$ from one song was plotted against $\muf$ from the other
|
|
song, so that each dot within a subplot corresponds to a single feature
|
|
$f_i(t)$. For the intraspecific
|
|
comparisons~(Fig.\,\ref{fig:feat_cross_species}, upper triangular), the pairs
|
|
of $\muf$ are distributed closely around the diagonal, with a minimum
|
|
correlation coefficient of $\rho=0.82$, a maximum of $\rho=0.99$, and a median
|
|
of $\rho=0.91$. A given $f_i(t)$ thus tends to have a similar $\muf$ across
|
|
different songs of the same species. In contrast, the pairs of $\muf$ for the
|
|
interspecific comparisons~(Fig.\,\ref{fig:feat_cross_species}, lower
|
|
triangular) are distributed in a variety of different ways, most in broader
|
|
clouds (e.g.\ \textit{C. biguttulus} vs.\ \textit{C. mollis}) but some more
|
|
narrowly around the diagonal (e.g.\ \textit{P. parallelus} vs.\ \textit{C.
|
|
dispar}). The correlation coefficients $\rho$ vary widely between different
|
|
interspecific comparisons, with a minimum of $\rho=-0.1$, a maximum of
|
|
$\rho=0.91$, and a median of $\rho=0.40$. A given $f_i(t)$ therefore tends to
|
|
have a less similar $\muf$ across different species than within the same
|
|
species, although certain exceptions exist~(Fig.\,\ref{fig:feat_cross_species},
|
|
lower right). Accordingly, the feature representation that is generated by the
|
|
model pathway is, in principle, suitable for the distinction between different
|
|
species-specific songs. However, even the songs of the same species are subject
|
|
to considerable variability in various aspects and depending on a multitude of
|
|
external and internal factors, which cannot be fully captured based on a
|
|
limited number of songs. The results of the current analysis are hence to be
|
|
treated as a proof-of-concept that paves the way towards more comprehensive
|
|
investigations on the details of song representation in feature space,
|
|
including the effects of different parameters of the model pathway as well as
|
|
the inclusion of additional songs and species to reflect the complexity of
|
|
natural song variation.
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_features_cross_species.pdf}
|
|
\caption{\textbf{Interspecific and intraspecific feature variability.}
|
|
Average value $\muf$ of each feature $f_i(t)$ against its
|
|
counterpart from a second feature set based on a different
|
|
input $\raw(t)$. Data is based on the saturated $\muf$
|
|
from Fig.\,\ref{fig:pipeline_full}. Each dot within a
|
|
subplot represents a single $f_i(t)$. Different color
|
|
shades indicate different types of Gabor kernels with
|
|
specific lobe number $\kn$ and either $+$ or $-$ sign,
|
|
sorted (dark to light) first by increasing $\kn$ and then
|
|
by sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for
|
|
each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and
|
|
$16\,$ms per type; 8 types, 40 kernels in total).
|
|
\textbf{Lower triangular}:~Interspecific comparisons
|
|
between single songs of different species.
|
|
\textbf{Upper triangular}:~Intraspecific comparisons
|
|
between different songs of a single species (\textit{O.
|
|
rufipes}).
|
|
\textbf{Lower right}:~Distribution of correlation
|
|
coefficients $\rho$ for each interspecific and
|
|
intraspecific comparison. Dots indicate single $\rho$
|
|
                 values.
|
|
}
|
|
\label{fig:feat_cross_species}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\newpage
|
|
\section{Discussion}
|
|
|
|
In the current study, we have established a physiologically inspired functional
|
|
model of the grasshopper song recognition pathway. The model pathway covers the
|
|
entire auditory processing stream, from the sound reception at the tympanal
|
|
membrane via peripheral receptor neurons and local interneurons up to the
|
|
generation of a high-dimensional feature representation at the level of the
|
|
ascending neurons and beyond in the SEG. Using this model pathway, we have
|
|
identified two key computational mechanisms for the emergence of
|
|
intensity-invariant song representations. Each mechanism comprises a nonlinear
|
|
transformation and a subsequent linear transformation. The first mechanism
|
|
consists of logarithmic compression and adaptation, which takes place at the
|
|
level of the receptor neurons and local interneurons. The second mechanism
|
|
consists of thresholding and temporal averaging, which takes place either at
|
|
the level of the ascending neurons or further downstream in the SEG. Systematic
|
|
investigation of both mechanisms revealed a persistent trade-off between the
|
|
intensity invariance and the SNR of the song representations along the pathway.
|
|
In the following, we discuss the capabilities and limitations of our model
|
|
approach as well as the implications of our findings for the design of the
|
|
grasshopper auditory system, the evolution of species-specific grasshopper
|
|
songs, and the ethological relevance of intensity invariance in a natural
|
|
acoustic environment.
|
|
|
|
\subsection{Leveraging functional modelling to investigate sensory systems}
|
|
|
|
Our understanding of sensory processing systems is based on the distributed
|
|
accumulation of anatomical, physiological, and ethological evidence. Functional
|
|
modelling provides a powerful tool to integrate the available fragments into a
|
|
coherent whole. It facilitates systematic, reproducible investigations of
|
|
relevant parameters such as scale $\sca$ or threshold value $\thr$. Moreover,
|
|
it allows us to address questions of broader scope by generalizing from concrete
|
|
evidence. For instance, the interaction between the two mechanisms of intensity
|
|
invariance is most accessible if both mechanisms can be treated as consecutive
|
|
stages along the pathway --- where the output of the first stage relates
|
|
directly to the input of the second stage --- rather than separate entities.
|
|
The model pathway also provides a general basis for comparing song
|
|
representations across different species without the need for species-specific
|
|
models. However, the potential of functional modelling for research on sensory
|
|
systems depends entirely on the amount of available knowledge about the system.
|
|
The grasshopper song recognition pathway is a comparatively simple and very
|
|
well-understood system and is therefore a particularly suitable candidate for
|
|
functional modelling. Other sensory systems that are either more complex or
|
|
have not been subject to decades of study will likely not be suitable for this
|
|
approach yet.
|
|
|
|
\subsection{Feature representation, temporal averaging, and song design}
|
|
\label{sec:constant_feat}
|
|
|
|
The feature set is the final song representation along the model pathway and
|
|
constitutes the basis for song recognition. Each feature $f_i(t)$ results from
|
|
the thresholding of the respective kernel response $c_i(t)$ by $\nl$ and the
|
|
subsequent temporal averaging of binary response $b_i(t)$ by a lowpass filter
|
|
with extremely low cutoff frequency $\fc$. At a given time point $t$, $f_i(t)$
|
|
approximately quantifies the proportion of time during which $c_i(t)$ exceeds
|
|
the threshold value $\thr$ within the averaging interval $\tlp$ specified by
|
|
$\fc$. The value of $f_i(t)$ is hence determined by $\thr$ with respect to the
|
|
distribution $\pci$ of $c_i(t)$ and is restricted to the interval $[0,1]$.
|
|
|
|
Different species-specific songs are represented by different combinations of
|
|
feature values, which should preferably be constant for the duration of a song
|
|
to enable reliable recognition. The fundamental requirement for a constant
|
|
$f_i(t)$ is that the time where $c_i(t)>\thr$ during $\tlp$ is the same for all
|
|
$t$, which is fulfilled if $\pci$ is stable across $t$. The most
|
|
straightforward way to achieve a stable $\pci$ is that $c_i(t)$ is periodic and
|
|
$\tlp$ is sufficiently long to average over multiple cycles of $c_i(t)$.
|
|
Song-evoked $c_i(t)$ are indeed approximately periodic, which is largely an
|
|
inherited property of the song itself. Most grasshopper songs are produced by
|
|
stridulation, which refers to the pulling of the serrated stridulatory file on
|
|
the hindlegs across a resonating vein on the
|
|
forewings~(\bcite{helversen1977stridulatory}; \bcite{stumpner1994song};
|
|
\bcite{helversen1997recognition}). Every ``tooth'' that strikes the vein
|
|
generates a brief sound pulse; multiple pulses make up a syllable; and the
|
|
repetition of syllables and pauses results in a pattern with a high degree of
|
|
temporal regularity. Accordingly, a robust feature representation in the sense
|
|
of constant $f_i(t)$ is tightly linked to the mechanism of sound production and
|
|
the temporal structure of the generated song.
|
|
|
|
Various grasshopper species, especially those with longer songs like \textit{C.
|
|
mollis}, \textit{G. rufus}, or \textit{O. rufipes}, tend to stridulate softly
|
|
at first and then continuously increase the amplitude of their song over time.
|
|
This slow ``ramping'' amplitude modulation makes the overall song less periodic
|
|
despite its temporal regularity. The ``ramping'' appears more pronounced in
|
|
$\env(t)$ compared to $\adapt(t)$, which suggests that the logarithmic
|
|
compression and adaptation during the preprocessing stage might be at least
|
|
partially beneficial for mitigating the effect of this amplitude modulation on
|
|
later representations. However, the adaptation of $\adapt(t)$ can only act on
|
|
certain time scales --- depending on the cutoff frequency of the underlying
|
|
highpass filter --- and is hence not able to compensate for ``ramping'' across
|
|
the entire duration of a song.
|
|
|
|
Certain grasshopper species like \textit{Chorthippus dorsatus} are known to
|
|
switch their stridulation pattern in the middle of a
|
|
song~(\bcite{stumpner1994song}). \textit{C. dorsatus} starts stridulating with
|
|
both hindlegs in synchrony and thereby generates a pronounced syllable-pause
|
|
pattern similar to that of \textit{P. parallelus}. For the last part of its
|
|
song, however, \textit{C. dorsatus} switches to an alternating leg movement,
|
|
which results in a more continuous but not entirely unstructured rattling
|
|
sound. It is unclear what this composite design means for the feature
|
|
representation of \textit{C. dorsatus} songs. In principle, both parts of the
|
|
song could result in similar $\pci$ despite their different temporal structure,
|
|
which would allow for consistent $f_i(t)$ across the entire song. However, it
|
|
appears more likely that only one part of the song encodes species identity,
|
|
while the other part serves a different purpose such as fitness
|
|
advertisement~(SOURCE?).
|
|
|
|
Finally, the question remains how the choice of an appropriate averaging
|
|
interval $\tlp$ depends on the duration and temporal structure of a song. The
|
|
minimum $\tlp$ should encompass at least a few cycles of $c_i(t)$ to ensure a
|
|
stable $\pci$ and hence a constant $f_i(t)$. The maximum $\tlp$ should not
|
|
exceed the duration of a song to avoid the inclusion of behaviorally irrelevant
|
|
information. The longer $\tlp$, the longer $f_i(t)$ takes to stabilize after
|
|
the onset and before the offset of a song, which narrows the time window for
|
|
reliable recognition. The duration of species-specific grasshopper songs can
|
|
range from a few hundred milliseconds (e.g.\ \textit{Stethophyma grossum}) to
|
|
well over a minute (e.g.\ \textit{C. mollis}), so that the optimal $\tlp$ is
|
|
likely to differ between species.
|
|
|
|
\subsection{Sensory invariances in the grasshopper auditory system}
|
|
|
|
The notion of invariance is fundamental for sensory processing systems.
|
|
Invariance, in the general sense, can be described as the property of a
|
|
transformation to maintain variation across certain meaningful input parameters
|
|
in its output while discarding variation across other input parameters. This
|
|
boils down to a selective input-output decorrelation that allows the system to
|
|
represent only those aspects of the stimulus that are behaviorally relevant to
|
|
the organism.
|
|
|
|
The grasshopper auditory system has to deal with a number of sources of
|
|
non-informative song variation. For instance, the temporal structure of the
|
|
song pattern warps with temperature~(\bcite{skovmand1983song}). This also
|
|
affects certain structural parameters that are essential for song recognition,
|
|
mainly the duration of syllables and pauses. The auditory system can compensate
|
|
for this variation by reading out relative temporal relationships rather than
|
|
absolute time intervals~(\bcite{creutzig2009timescale};
|
|
\bcite{creutzig2010timescale}). The ratio of syllable duration to pause
|
|
duration is relatively constant across temperatures and has been shown to be
|
|
suitable for song recognition~(\bcite{helversen1972gesang}), so that there is
|
|
likely no need to retain any information about the absolute duration of
|
|
syllables and pauses.
|
|
|
|
The situation is more complex for variations in song intensity. Song intensity
|
|
at the receiver's position depends mostly on the distance to the sender and is
|
|
hence not a reliable cue to infer species identity. The auditory system should
|
|
therefore be invariant to intensity variations to recognize conspecific songs
|
|
regardless of sender distance. However, song intensity --- specifically, the
|
|
interaural intensity difference --- is also required for directional hearing,
|
|
which is essential for phonotaxis~(\bcite{helversen1988interaural}). Conflicts
|
|
between song recognition and directional hearing are avoided in the auditory
|
|
system by distributing both functions across two parallel
|
|
pathways~(\bcite{helversen1984parallel}; \bcite{ronacher1986routes}). This is
|
|
the main reason why our model pathway is focused entirely on song recognition
|
|
and has no capacity for directional hearing, no matter how relevant it may be
|
|
to the grasshopper.
|
|
|
|
Furthermore, ``invariance to variations in song intensity'' does not do justice
|
|
to the full extent of the problem. Intensity is a function of song amplitude
|
|
within a certain time frame. It can refer to the individual syllables and
|
|
pauses of the song pattern as well as the entire song --- the former is
|
|
relevant for song recognition, while the latter is not. Intensity invariance in
|
|
the current context can therefore be described as time scale-selective
|
|
sensitivity to the faster amplitude dynamics of the song pattern and
|
|
simultaneous insensitivity to slower, more sustained amplitude dynamics. In the
|
|
model pathway, this time scale selectivity is reflected by the cutoff frequency
|
|
$\fc$ of the highpass filter that underlies the adaptation of $\adapt(t)$: Most
|
|
$\fc$ are effective in removing the local offset of $\db(t)$ and render
|
|
$\adapt(t)$ intensity-invariant, but only sufficiently low $\fc$ will leave the
|
|
relevant amplitude dynamics of the song pattern intact.
|
|
|
|
\subsection{Intensity invariance versus SNR}
|
|
|
|
Each processing step along the model pathway is a transformation between input
|
|
representation and output representation. The intensity of the input is
|
|
characterized by scale $\sca$. The intensity of the output is characterized by
|
|
an appropriate intensity measure. If the transformation renders the output more
|
|
intensity-invariant, then the intensity measure will saturate for sufficiently
|
|
large $\sca$, which caps the output SNR to a constant value across these
|
|
$\sca$. Otherwise, the intensity measure and hence the output SNR will increase
|
|
monotonically with $\sca$. The trade-off between intensity invariance and SNR
|
|
refers to the principle that a transformation can either improve intensity
|
|
invariance or maintain SNR --- it cannot do both at the same time. This
|
|
principle is presumably not specific to the two mechanisms along the model
|
|
pathway but rather a general property of transformations that equalize between
|
|
different input intensities.
|
|
|
|
Logarithmic compression and adaptation by highpass filtering is capable of
|
|
equalizing a wide range of $\sca$. In the absence of noise component $\noc(t)$,
|
|
output $\adapt(t)$ is a perfectly intensity-invariant representation of song
|
|
component $\soc(t)$ across all $\sca>0$. However, the presence of $\noc(t)$
|
|
limits the effectiveness of this mechanism to sufficiently large $\sca$. This
|
|
means that intensity invariance and SNR interact at the input level, as well.
|
|
Specifically, the saturation point of $\adapt(t)$ is determined by the input
|
|
SNR of $\env(t)$, which in turn depends on the initial SNR of the sound signal
|
|
$\raw(t)$. This initial SNR is presumably improved by the bandpass filtering of
|
|
$\raw(t)$ into $\filt(t)$ at the tympanal membrane, which attenuates
|
|
frequencies outside the relevant range of grasshopper songs. The SNR is then
|
|
further improved by the rectification and lowpass filtering of $\filt(t)$ into
|
|
$\env(t)$. This improvement depends on the cutoff frequency $\fc$ of the
|
|
lowpass filter --- the lower $\fc$, the higher the SNR of $\env(t)$ at a given
|
|
$\sca$. However, $\fc$ must not be too low to avoid the attenuation of relevant
|
|
amplitude dynamics of the song pattern. The saturation level of $\adapt(t)$,
|
|
unlike its saturation point, is independent of the SNR of $\env(t)$ because the
|
|
influence of $\noc(t)$ is negligible for sufficiently large $\sca$. The output
|
|
SNR of $\adapt(t)$ saturates at a comparatively low value of around 10. This might
|
|
in part be a consequence of the logarithm, which compresses different higher
|
|
intensities but also amplifies lower intensities, including the noise floor.
|
|
Both the saturation level and the saturation point of $\adapt(t)$ vary between
|
|
different species and individual songs. These differences are likely rooted in
|
|
the way in which logarithmic compression acts on the specific distribution of
|
|
$\env(t)$, which is determined by $\fc$ as well as the temporal structure and
|
|
frequency spectrum of the rectified $\filt(t)$.
|
|
|
|
Thresholding and temporal averaging renders feature $f_i(t)$
|
|
intensity-invariant for sufficiently large $\sca$. The trade-off between
|
|
intensity invariance and SNR is mediated by threshold value $\thr$. A lower
|
|
$\thr$ ($\thr\to0$) improves intensity invariance by shifting the saturation
|
|
point towards lower $\sca$ but also decreases the SNR of $f_i(t)$. The
|
|
saturation level of $f_i(t)$ is independent of $\thr$ as long as the intensity
|
|
invariance by the previous mechanism is neglected. The SNR of $f_i(t)$ is
|
|
therefore determined solely by the pure-noise response of $f_i(t)$. The
|
|
distribution $\pci$ of the pure-noise kernel response $c_i(t)$ is largely a
|
|
normal distribution with mean $\mu\approx0$ for all kernels $k_i(t)$. The value
|
|
of the pure-noise $f_i(t)$ is hence 0.5 for $\thr=0$ and decreases for higher
|
|
$\thr$. If $\thr$ is set above the maximum of $c_i(t)$, the pure-noise feature
|
|
value is 0, which results in an ``unlimited'' SNR of $f_i(t)$. In this case, any
|
|
non-zero feature value that is sustained for a sufficient duration could serve
|
|
as an indicator for the presence of $\soc(t)$, although at the cost of a higher
|
|
saturation point. The maximum of the pure-noise $c_i(t)$ is assumed to be very
|
|
small due to the various SNR improvements along the pathway, so that the
|
|
required increase in $\thr$ and hence the saturation point of $f_i(t)$ is not
|
|
expected to be substantial. However, exploiting the capacity of $f_i(t)$ for
|
|
arbitrarily high SNR would certainly require a fine evolutionary tuning of
|
|
$\thr$ to the properties of both the species-specific song and the natural
|
|
noise in a certain habitat.
|
|
|
|
\newpage
|
|
\subsection{Intensity invariance versus intensity invariance}
|
|
|
|
Two consecutive mechanisms of intensity invariance do not necessarily add up to
|
|
a stronger overall intensity invariance. If the first mechanism results in a
|
|
lower saturation point than the second mechanism by itself, the saturation
|
|
point of feature $f_i(t)$ will be determined solely by the first mechanism. In
|
|
this case, the saturation level of $f_i(t)$ will conform to the intensity that
|
|
$f_i(t)$ can reach for the given saturation point rather than the intrinsic
|
|
saturation level of $f_i(t)$. Conversely, if the second mechanism results in a
|
|
lower saturation point than the first mechanism, both the saturation point and
|
|
the saturation level of $f_i(t)$ will be determined by the second mechanism.
|
|
The saturation points of $f_i(t)$ across the set are distributed over a much
|
|
wider range than those of the preceding kernel responses $c_i(t)$, which
|
|
suggests that the interaction between the two mechanisms is specific to
|
|
individual kernels $k_i(t)$. A number of $f_i(t)$ achieve a lower saturation
|
|
point than the respective $c_i(t)$, while some $f_i(t)$ exhibit similar or only
|
|
marginally lower saturation points. This raises the question whether two
|
|
consecutive mechanisms of intensity invariance are actually beneficial for the
|
|
overall system.
|
|
|
|
From a purely functional perspective, the answer could be that logarithmic
|
|
compression and adaptation is a necessary preprocessing step towards a robust
|
|
feature representation, even if thresholding and temporal averaging alone would
|
|
be sufficient to render $f_i(t)$ intensity-invariant. This preprocessing likely
|
|
improves the temporal regularity of the song pattern in $\adapt(t)$ and
|
|
$c_i(t)$, which is required for constant $f_i(t)$ across the duration of a
|
|
song~(Section\,\ref{sec:constant_feat}). It also ensures consistency between
|
|
the distribution $\pci$ of $c_i(t)$ across songs of different intensity, which
|
|
is essential for the generation of consistent species-specific $f_i(t)$ under a
|
|
static $\thr$. From a physiological perspective, the answer is likely that
|
|
neurons possess only a limited firing rate for encoding stimulus intensities
|
|
that can range over several orders of magnitude. Sigmoidal tuning curves over
|
|
logarithmically compressed stimulus intensities are a common property of
|
|
sensory neurons across various modalities~(SOURCE?), and neurons of the
|
|
grasshopper auditory system are no exception~(\bcite{suga1960peripheral};
|
|
\bcite{gollisch2002energy}).
|
|
|
|
\subsection{Implications for behavior in a natural acoustic environment}
|
|
|
|
Most grasshoppers live in environments that are communally inhabited by
|
|
numerous individuals from multiple species. Their acoustic environment is
|
|
characterized by noise from various sources --- abiotic ones like wind and
|
|
water, but also the songs of both hetero- and conspecifics. This limits the SNR
|
|
that each individual can achieve for its own song, and hence the effectiveness
|
|
of the intensity-invariant processing in the auditory system. Producing higher
|
|
song intensities is not a viable solution to this problem, because these also
|
|
contribute to the overall noise floor. A possible behavioral solution could be
|
|
to produce songs in a ``turn-taking'' manner to avoid the temporal superposition
|
|
of multiple songs into overly intense signals. This would also prevent the
|
|
mutual distortion of the respective song pattern. Another solution could be to
|
|
spatially separate from other nearby grasshoppers to spread the potential noise
|
|
sources over a larger area. However, according to our analysis based on field
|
|
recordings as well as previous work on the topic~(\bcite{lang2000acoustic}),
|
|
reliable song recognition is limited to little more than 1\,m from the sender,
|
|
so that a grasshopper also cannot afford to stay too far away from its
|
|
conspecifics. A better solution may hence be to collectively produce songs at
|
|
lower-than-possible intensities, which would reduce the overall noise floor for
|
|
all nearby individuals. Importantly, the limitation of intensity invariance by
|
|
SNR likely applies to all grasshoppers regardless of species, so that the
|
|
behavioral strategies could be shared among the species that coexist in a given
|
|
habitat.
|
|
|
|
% Because the presumed restriction of song recognition
|
|
% by means of the noise floor applies to all grasshoppers in a certain area,
|
|
% these strategies may not be specific to some of the species at this location.
|
|
% Instead, they must be shared by all grasshopper species that coexist within a
|
|
% portion of a given habitat, which would provide an important implication for
|
|
% the evolution of grasshopper songs in communities of multiple species.
|
|
|
|
%%% RELICS OF INTRODUCTION %%%
|
|
% - Nonlinear operations can be used to detach representations from graded physical
|
|
% stimulus (to facilitate categorical behavioral decision-making?):\\
|
|
% 1) Capture sufficiently precise amplitude information: $\env(t)$, $\adapt(t)$\\
|
|
% $\rightarrow$ Closely following the AM of the acoustic stimulus\\
|
|
% 2) Quantify relevant stimulus properties on a graded scale: $c_i(t)$\\
|
|
% $\rightarrow$ More decorrelated representation, compared to prior stages\\
|
|
% 3) Nonlinearity: Distinguish between "relevant vs irrelevant" values: $b_i(t)$\\
|
|
% $\rightarrow$ Trading a graded scale for two or more categorical states\\
|
|
% 4) Represent stimulus properties under relevance constraint: $f_i(t)$\\
|
|
% $\rightarrow$ Graded again but highly decorrelated from the acoustic stimulus\\
|
|
% 5) Categorical behavioral decision-making requires further nonlinearities\\
|
|
% $\rightarrow$ Parameters of a behavioral response may be graded (e.g. approach speed),
|
|
% initiation of one behavior over another is categorical (e.g. approach/stay)
|
|
|
|
% Multi-species, multi-individual communally inhabited environments\\
|
|
% - Temporal overlap: Simultaneous singing across individuals/species common\\
|
|
% - Frequency overlap: Little speciation into frequency bands (likely unused)\\
|
|
% - "Biotic noise": Hetero-/conspecifics ("Another one's songs are my noise")\\
|
|
% - "Abiotic noise": Wind, water, vegetation, anthropogenic\\
|
|
% - Effects of habitat structure on sound propagation (landscape - soundscape)\\
|
|
% $\rightarrow$ Sensory constraints imposed by the (acoustic) environment
|
|
|
|
% Cluster of auditory challenges (interlocking constraints $\rightarrow$ tight coupling):\\
|
|
% From continuous acoustic input, generate neuronal representations that...\\
|
|
% 1)...allow for the separation of relevant (song) events from ambient noise floor\\
|
|
% 2)...compensate for behaviorally non-informative song variability (invariances)\\
|
|
% 3)...carry sufficient information to characterize different song patterns,
|
|
% recognize the ones produced by conspecifics, and make appropriate behavioral
|
|
% decisions based on context (sender identity, song type, mate/rival quality)
|
|
|
|
% How can a human observer conceive a grasshopper's auditory percepts?\\
|
|
% - How to investigate the workings of the auditory pathway as a whole?\\
|
|
% - How to systematically test effects and interactions of processing parameters?\\
|
|
% - How to integrate the available knowledge on anatomy, physiology, ethology?\\
|
|
% $\rightarrow$ Abstract, simplify, formalize $\rightarrow$ Functional model framework
|
|
|
|
\newpage
|
|
\section{Appendix}
|
|
|
|
% Not sure if we really need this one. Might raise more questions than it
|
|
% provides answers. The noise component is not stable throughout nonlinear
|
|
% transformations, that is all the reader needs to know, i believe.
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_noise_env_sd_conversion_appendix.pdf}
|
|
\caption{\textbf{Conversion of the noise component by envelope extraction.}
|
|
Standard deviation $\sigma_{\eta}$ of noise component
|
|
$\noc(t)$ within the signal envelope $\env(t)$ over scale
|
|
$\sca$. Based on input $\raw(t)$ with $\sigma_{\eta}=1$
|
|
(corresponding to the analysis underlying
|
|
Fig.\,\ref{fig:rect-lp}), using 100 random realizations of
|
|
$\noc(t)$.}
|
|
\label{fig:app_env-sd}
|
|
\end{figure}% Referenced.
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_rect-lp_appendix.pdf}
|
|
\caption{\textbf{Species-specific data underlying Fig.\,\ref{fig:rect-lp}e.}
|
|
Ratio of the standard deviation $\sigma_{\text{env}}$ to
|
|
the pure-noise reference $\sigma_{\eta}$ of the signal
|
|
envelope $\env(t)$ over scale $\sca$ for different cutoff
|
|
frequencies $\fc$ of the lowpass filter extracting
|
|
$\env(t)$. Solid lines and shaded areas indicate mean
|
|
$\pm$ standard deviation across songs per recording.
|
|
Dashed lines indicate mean across recordings (shown in
|
|
Fig.\,\ref{fig:rect-lp}e).}
|
|
\label{fig:app_rect-lp}
|
|
\end{figure}% Referenced.
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_log-hp_appendix.pdf}
|
|
\caption{\textbf{Species-specific data underlying Fig.\,\ref{fig:log-hp}e.}
|
|
Ratio of the standard deviation $\sigma_{\text{adapt}}$ to
|
|
the pure-noise reference $\sigma_{\eta}$ of the
|
|
intensity-adapted envelope $\adapt(t)$ over scale $\sca$.
|
|
Solid lines and shaded areas indicate mean $\pm$ standard
|
|
deviation across songs per recording. Dashed lines
|
|
indicate mean across recordings (shown in
|
|
Fig.\,\ref{fig:log-hp}e).}
|
|
\label{fig:app_log-hp_curves}
|
|
\end{figure}% Referenced.
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_saturation_log-hp_appendix.pdf}
|
|
\caption{\textbf{Species-specific saturation points underlying
|
|
Fig.\,\ref{fig:log-hp}e.}
|
|
Distribution of saturation points ($95\,\%$ curve span) of
|
|
ratio $\sigma_{\text{adapt}} / \sigma_{\eta}$ of the
|
|
intensity-adapted envelope $\adapt(t)$ over scale $\sca$
|
|
across all available songs. Dots indicate the saturation
|
|
point of the mean curve across songs and recordings (shown
|
|
in Fig.\,\ref{fig:log-hp}e, see also appendix
|
|
Fig.\,\ref{fig:app_log-hp_curves}).}
|
|
\label{fig:app_log-hp_saturation}
|
|
\end{figure}% Referenced.
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh-lp_pure_appendix.pdf}
|
|
\caption{\textbf{Species-specific data underlying Fig.\,\ref{fig:thresh-lp_species}bd.}
|
|
Average value $\muf$ of each of the three features
|
|
$f_i(t)$ over scale $\sca$ in the noiseless case. Solid
|
|
lines and shaded areas indicate mean $\pm$ standard
|
|
deviation across songs per recording. Dashed lines
|
|
indicate mean across recordings (shown in
|
|
Fig.\,\ref{fig:thresh-lp_species}bd).}
|
|
\label{fig:app_thresh-lp_pure}
|
|
\end{figure}% Referenced.
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh-lp_noise_appendix.pdf}
|
|
\caption{\textbf{Species-specific data underlying Fig.\,\ref{fig:thresh-lp_species}ce.}
|
|
Average value $\muf$ of each of the three features
|
|
$f_i(t)$ over scale $\sca$ in the noisy case. Solid lines
|
|
and shaded areas indicate mean $\pm$ standard deviation
|
|
across songs per recording. Dashed lines indicate mean
|
|
across recordings (shown in
|
|
Fig.\,\ref{fig:thresh-lp_species}ce).}
|
|
\label{fig:app_thresh-lp_noise}
|
|
\end{figure}% Referenced.
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_thresh_lp_appendix.pdf}
|
|
\caption{\textbf{Relation between threshold value and pure-noise feature
|
|
value for Fig.\,\ref{fig:thresh-lp_single} and
|
|
Fig.\,\ref{fig:thresh-lp_species}.}
|
|
Proportion of pure-noise kernel response $c_i(t)$ that
|
|
exceeds threshold value $\thr$ --- which determines the
|
|
average value $\muf$ of feature $f_i(t)$ --- over $\thr$
|
|
in multiples of standard deviation $\sigma_{c_i}$.
|
|
                 Corresponds to a ``reverse'' cumulative distribution
|
|
function of $c_i(t)$. Black solid lines indicate rCDF per
|
|
kernel $k_i(t)$. Red dashed line indicates rCDF for a
|
|
normal distribution with $\mu=0$ and $\sigma=1$.
|
|
}
|
|
\label{fig:app_thresh-lp_kern-sd}
|
|
\end{figure}% Referenced.
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_full_appendix.pdf}
|
|
\caption{\textbf{Relation between threshold value and pure-noise feature
|
|
value for Fig.\,\ref{fig:pipeline_full}.}
|
|
Proportion of pure-noise kernel response $c_i(t)$ that
|
|
exceeds threshold value $\thr$ --- which determines the
|
|
average value $\muf$ of feature $f_i(t)$ --- over $\thr$
|
|
in multiples of standard deviation $\sigma_{c_i}$.
|
|
                 Corresponds to a ``reverse'' cumulative distribution
|
|
function of $c_i(t)$. Black solid lines indicate rCDF per
|
|
kernel $k_i(t)$. Red dashed line indicates rCDF for a
|
|
normal distribution with $\mu=0$ and $\sigma=1$.
|
|
}
|
|
\label{fig:app_full_kern-sd}
|
|
\end{figure}% Referenced.
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_short_appendix.pdf}
|
|
\caption{\textbf{Relation between threshold value and pure-noise feature
|
|
value for Fig.\,\ref{fig:pipeline_short}.}
|
|
Proportion of pure-noise kernel response $c_i(t)$ that
|
|
exceeds threshold value $\thr$ --- which determines the
|
|
average value $\muf$ of feature $f_i(t)$ --- over $\thr$
|
|
in multiples of standard deviation $\sigma_{c_i}$.
|
|
                 Corresponds to a ``reverse'' cumulative distribution
|
|
function of $c_i(t)$. Black solid lines indicate rCDF per
|
|
kernel $k_i(t)$. Red dashed line indicates rCDF for a
|
|
normal distribution with $\mu=0$ and $\sigma=1$.
|
|
}
|
|
\label{fig:app_short_kern-sd}
|
|
\end{figure}% Referenced.
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_field_appendix.pdf}
|
|
\caption{\textbf{Relation between threshold value and pure-noise feature
|
|
value for Fig.\,\ref{fig:pipeline_field}.}
|
|
Proportion of pure-noise kernel response $c_i(t)$ that
|
|
exceeds threshold value $\thr$ --- which determines the
|
|
average value $\muf$ of feature $f_i(t)$ --- over $\thr$
|
|
in multiples of standard deviation $\sigma_{c_i}$.
|
|
                 Corresponds to a ``reverse'' cumulative distribution
|
|
function of $c_i(t)$. Black solid lines indicate rCDF per
|
|
kernel $k_i(t)$. Red dashed line indicates rCDF for a
|
|
normal distribution with $\mu=0$ and $\sigma=1$.
|
|
}
|
|
\label{fig:app_field_kern-sd}
|
|
\end{figure}% Referenced.
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_cross_species_thresh_appendix.pdf}
|
|
\caption{\textbf{Threshold-dependent intensity invariance of
|
|
species-specific feature sets.}
|
|
Same processing as in Fig.\,\ref{fig:pipeline_full}, using
|
|
different kernel-specific threshold values $\thr$
|
|
(multiples of pure-noise standard deviation
|
|
                 $\sigma_{\eta_i}$ of $c_i(t)$ for $\sca=0$; see also
|
|
appendix Fig.\,\ref{fig:app_full_kern-sd}). Average value
|
|
$\muf$ of each feature $f_i(t)$ over $\sca$.
|
|
}
|
|
\label{fig:app_cross_species_thresh}
|
|
\end{figure}% Reference this one!
|
|
\FloatBarrier
|
|
|
|
\end{document} |