1267 lines
69 KiB
TeX
1267 lines
69 KiB
TeX
\documentclass[a4paper, 12pt]{article}
|
|
|
|
\usepackage[left=2cm,right=2cm,top=2cm,bottom=2cm,includeheadfoot]{geometry}
|
|
\usepackage[onehalfspacing]{setspace}
|
|
\usepackage{graphicx}
|
|
\usepackage{svg}
|
|
\usepackage{import}
|
|
\usepackage{float}
|
|
\usepackage{placeins}
|
|
\usepackage{parskip}
|
|
\usepackage{amsmath}
|
|
\usepackage{amssymb}
|
|
\usepackage{subcaption}
|
|
\usepackage[labelfont=bf, textfont=small]{caption}
|
|
\usepackage[german,english]{babel}
|
|
\addto\captionsenglish{\renewcommand{\figurename}{Fig.}}
|
|
\addto\captionsenglish{\renewcommand{\tablename}{Tab.}}
|
|
\usepackage[separate-uncertainty=true, locale=DE]{siunitx}
|
|
\sisetup{output-exponent-marker=\ensuremath{\mathrm{e}}}
|
|
% \usepackage[capitalize]{cleveref}
|
|
% \crefname{figure}{Fig.}{Figs.}
|
|
% \crefname{equation}{Eq.}{Eqs.}
|
|
% \creflabelformat{equation}{#2#1#3}
|
|
\usepackage[
|
|
backend=biber,
|
|
style=authoryear,
|
|
pluralothers=true,
|
|
maxcitenames=1,
|
|
mincitenames=1
|
|
]{biblatex}
|
|
\addbibresource{cite.bib}
|
|
%\bibdata
|
|
%\bibstyle
|
|
%\citation
|
|
|
|
\title{Emergent intensity invariance vs. signal-to-noise ratio at three consecutive processing stages along the grasshopper song recognition pathway}
|
|
\author{Jona Hartling, Jan Benda}
|
|
\date{}
|
|
|
|
\begin{document}
|
|
\maketitle{}
|
|
|
|
% Text references and citations:
|
|
\newcommand{\bcite}[1]{\mbox{\cite{#1}}}
|
|
% \newcommand{\fref}[1]{\mbox{\cref{#1}}}
|
|
% \newcommand{\fref}[1]{\mbox{Fig.\,\ref{#1}}}
|
|
% \newcommand{\eref}[1]{\mbox{\cref{#1}}}
|
|
% \newcommand{\eref}[1]{\mbox{Eq.\,\ref{#1}}}
|
|
|
|
% Subplot lettering:
|
|
\newcommand{\figa}{\textbf{a}}
|
|
\newcommand{\figb}{\textbf{b}}
|
|
\newcommand{\figc}{\textbf{c}}
|
|
\newcommand{\figd}{\textbf{d}}
|
|
\newcommand{\fige}{\textbf{e}}
|
|
|
|
% Math shorthands - Standard symbols:
|
|
\newcommand{\dec}{\log_{10}} % Logarithm base 10
|
|
\newcommand{\infint}{\int_{-\infty}^{+\infty}} % Improper integral over the entire real line
|
|
|
|
% Math shorthands - Spectral filtering:
|
|
\newcommand{\bp}{h_{\text{BP}}(t)} % Bandpass filter function
|
|
\newcommand{\lp}{h_{\text{LP}}(t)} % Lowpass filter function
|
|
\newcommand{\hp}{h_{\text{HP}}(t)} % Highpass filter function
|
|
\newcommand{\fc}{f_{\text{cut}}} % Filter cutoff frequency
|
|
\newcommand{\tlp}{T_{\text{LP}}} % Lowpass filter averaging interval
|
|
\newcommand{\thp}{T_{\text{HP}}} % Highpass filter adaptation interval
|
|
|
|
% Math shorthands - Early representations:
|
|
\newcommand{\raw}{x_{\text{raw}}} % Placeholder input signal
|
|
\newcommand{\filt}{x_{\text{filt}}} % Bandpass filtered signal
|
|
\newcommand{\env}{x_{\text{env}}} % Signal envelope
|
|
\newcommand{\db}{x_{\text{log}}} % Logarithmically scaled signal
|
|
\newcommand{\dbref}{x_{\text{ref}}} % Decibel reference intensity
|
|
\newcommand{\adapt}{x_{\text{adapt}}} % Adapted signal
|
|
|
|
% Math shorthands - Kernel parameters:
|
|
\newcommand{\kw}{\sigma} % Unspecific Gabor kernel width
|
|
\newcommand{\kf}{\omega} % Unspecific Gabor kernel frequency
|
|
\newcommand{\kp}{\phi} % Unspecific Gabor kernel phase
|
|
\newcommand{\kn}{n} % Unspecific Gabor kernel lobe number
|
|
% \newcommand{\ks}{s} % Unspecific Gabor kernel sign
|
|
\newcommand{\kwi}{\kw_i} % Specific Gabor kernel width
|
|
\newcommand{\kfi}{\kf_i} % Specific Gabor kernel frequency
|
|
\newcommand{\kpi}{\kp_i} % Specific Gabor kernel phase
|
|
\newcommand{\kni}{\kn_i} % Specific Gabor kernel lobe number
|
|
% \newcommand{\ksi}{\ks_i} % Specific Gabor kernel sign
|
|
|
|
% Math shorthands - Auxiliary kernel parameters:
|
|
\newcommand{\fsin}{f_{\text{sin}}} % Carrier frequency
|
|
\newcommand{\rh}{h_{\text{rel}}} % Relative Gaussian height for FWRH
|
|
\newcommand{\fwrh}{\text{FWRH}} % Gaussian full-width at relative height
|
|
\newcommand{\off}{\beta_0} % Offset for linear frequency approximation
|
|
|
|
% Math shorthands - Thresholding nonlinearity:
|
|
\newcommand{\thr}{\Theta_i} % Step function threshold value
|
|
\newcommand{\nl}{H(c_i\,-\,\thr)} % Shifted Heaviside step function
|
|
|
|
% Math shorthands - Intensity invariance analysis:
|
|
\newcommand{\soc}{s} % Song component of synthetic mixture
|
|
\newcommand{\noc}{\eta} % Noise component of synthetic mixture
|
|
\newcommand{\sca}{\alpha} % Multiplicative scale of song component
|
|
\newcommand{\xvar}{\sigma_{x}^{2}} % Variance of synthetic mixture
|
|
\newcommand{\svar}{\sigma_{\text{s}}^{2}} % Song component variance
|
|
\newcommand{\nvar}{\sigma_{\eta}^{2}} % Noise component variance
|
|
\newcommand{\pc}{p(c,\,T)} % Probability density (general interval)
|
|
\newcommand{\pclp}{p(c,\,\tlp)} % Probability density (lowpass interval)
|
|
|
|
\section{Exploring a grasshopper's sensory world}
|
|
|
|
% Why functional models of sensory systems?
|
|
Our scientific understanding of sensory processing systems results from the
|
|
distributed accumulation of anatomical, physiological and ethological evidence.
|
|
This process is undoubtedly without alternative; however, it leaves us with the
|
|
challenge of integrating the available fragments into a coherent whole in order
|
|
to address issues such as the interaction between individual system components,
|
|
the functional limitations of the system overall, or taxonomic comparisons
|
|
between systems that process the same sensory modality. Any unified framework
|
|
that captures the essential functional aspects of a given sensory system thus
|
|
has the potential to deepen our current understanding and facilitate systematic
|
|
investigations. However, building such a framework is a challenging task. It
|
|
requires a wealth of existing knowledge of the system and the signals it
|
|
operates on, a clearly defined scope, and careful reduction, abstraction, and
|
|
formalization of the underlying structures and mechanisms.
|
|
|
|
% Why the grasshopper auditory system?
|
|
% Why focus on song recognition among other auditory functions?
|
|
One sensory system about which extensive information has been gathered over the
|
|
years is the auditory system of grasshoppers~(\textit{Acrididae}). Grasshoppers
|
|
rely on their sense of hearing primarily for intraspecific communication, which
|
|
includes mate attraction~(\bcite{helversen1972gesang}) and
|
|
evaluation~(\bcite{stange2012grasshopper}), sender
|
|
localization~(\bcite{helversen1988interaural}), courtship
|
|
display~(\bcite{elsner1968neuromuskularen}), rival
|
|
deterrence~(\bcite{greenfield1993acoustic}), and loss-of-signal predator
|
|
alarm~(SOURCE). In accordance with this rich behavioral repertoire,
|
|
grasshoppers have evolved a variety of sound production mechanisms to generate
|
|
acoustic communication signals for different contexts and ranges using their
|
|
wings, hindlegs, or mandibles~(\bcite{otte1970comparative}). Among the most
|
|
conspicuous acoustic signals of grasshoppers are their species-specific calling
|
|
songs, which broadcast the presence of the singing individual --- mostly the
|
|
males of the species --- to potential mates within range. These songs are
|
|
usually more characteristic of a species than morphological
|
|
traits~(\bcite{tishechkin2016acoustic}; \bcite{tarasova2021eurasius}), which
|
|
can vary greatly within species~(\bcite{rowell1972variable};
|
|
\bcite{kohler2017morphological}). The reliance on songs to mediate reproduction
|
|
represents a strong evolutionary driving force that resulted in a massive
|
|
species diversification~(\bcite{vedenina2011speciation};
|
|
\bcite{sevastianov2023evolution}), with over 6800 recognized grasshopper
|
|
species in the \textit{Acrididae} family~(\bcite{cigliano2024orthoptera}). It
|
|
is this diversity of species, and the crucial role of acoustic communication in
|
|
its emergence, that makes the grasshopper auditory system an intriguing
|
|
candidate for attempting to construct a functional model framework. As a
|
|
necessary reduction, the model we propose here focuses on the pathway
|
|
responsible for the recognition of species-specific calling songs, disregarding
|
|
other essential auditory functions such as directional
|
|
hearing~(\bcite{helversen1984parallel}; \bcite{ronacher1986routes};
|
|
\bcite{helversen1988interaural}).
|
|
|
|
% What are the signals the auditory system is supposed to recognize?
|
|
% Why is intensity invariance important for song recognition?
|
|
% (Obviously, split this paragraph)
|
|
To understand the functional challenges faced by the grasshopper auditory
|
|
system, one has to understand the properties of the songs it is designed to
|
|
recognize. Grasshopper songs are amplitude-modulated broad-band acoustic
|
|
signals. Most songs are produced by stridulation, during which the animal pulls
|
|
the serrated stridulatory file on its hindlegs across a resonating vein on the
|
|
forewings~(\bcite{helversen1977stridulatory}; \bcite{stumpner1994song};
|
|
\bcite{helversen1997recognition}). Every tooth that strikes the vein generates
|
|
a brief pulse of sound. Multiple pulses make up a syllable; and the alternation
|
|
of syllables and relatively quiet pauses forms a characteristic, though noisy,
|
|
waveform pattern. Song recognition depends on certain temporal and structural
|
|
parameters of this pattern, such as the duration of syllables and
|
|
pauses~(\bcite{helversen1972gesang}), the slope of pulse
|
|
onsets~(\bcite{helversen1993absolute}), and the accentuation of syllable onsets
|
|
relative to the preceding pause~(\bcite{balakrishnan2001song};
|
|
\bcite{helversen2004acoustic}). The amplitude modulation of the song is
|
|
sufficient for recognition~(\bcite{helversen1997recognition}). However, the
|
|
essential recognition cues can vary considerably with external physical
|
|
factors, which requires the auditory system to be invariant to such variations
|
|
in order to reliably recognize songs under different conditions. For instance,
|
|
the temporal structure of grasshopper songs warps with
|
|
temperature~(\bcite{skovmand1983song}). The auditory system can compensate for
|
|
this variability by reading out relative temporal relationships rather than
|
|
absolute time intervals~(\bcite{creutzig2009timescale};
|
|
\bcite{creutzig2010timescale}), as those remain relatively constant across
|
|
different temperatures~(\bcite{helversen1972gesang}). Another, perhaps even
|
|
more fundamental external source of song variability lies in the attenuation of
|
|
sound intensity with increasing distance to the sender. Sound attenuation
|
|
depends on both the frequency content of the signal and the vegetation of the
|
|
habitat~(\bcite{michelsen1978sound}). For the receiving auditory system, this
|
|
has two major implications. First, the amplitude dynamics of the song pattern
|
|
are steadily degraded over distance, which limits the effective communication
|
|
range of grasshoppers to~\mbox{1--2\,m} in their typical grassland
|
|
habitats~(\bcite{lang2000acoustic}). Second, the overall intensity level of
|
|
songs at the receiver's position varies depending on the location of the
|
|
sender, which should ideally not affect the recognition of the song pattern.
|
|
This necessitates that the auditory system achieves a certain degree of
|
|
intensity invariance --- a time scale-selective sensitivity to faster amplitude
|
|
dynamics and simultaneous insensitivity to slower, more sustained amplitude
|
|
dynamics. Intensity invariance in different auditory systems is often
|
|
associated with neuronal adaptation~(\bcite{benda2008spike};
|
|
\bcite{barbour2011intensity}; \bcite{ozeri2018fast}; more
|
|
general:~\bcite{benda2021neural}). In the grasshopper auditory system, a number
|
|
of neuron types along the processing chain exhibit spike-frequency adaptation
|
|
in response to sustained stimulus
|
|
intensities~(\bcite{romer1976informationsverarbeitung};
|
|
\bcite{gollisch2004input}; \bcite{hildebrandt2009origin};
|
|
\bcite{clemens2010intensity}; \bcite{fisch2012channel}) and thus likely
|
|
contribute to the emergence of intensity-invariant song representations. This
|
|
means that intensity invariance is not the result of a single processing step
|
|
but rather a gradual process, in which different neuronal populations
|
|
contribute to varying degrees~(\bcite{clemens2010intensity}) and by different
|
|
mechanisms~(\bcite{hildebrandt2009origin}). Approximating this process within a
|
|
functional model framework thus requires a considerable amount of
|
|
simplification. In this work, we demonstrate that even a small number of basic
|
|
physiologically inspired signal transformations --- specifically, pairs of
|
|
nonlinear and linear operations --- is sufficient to achieve a meaningful
|
|
degree of intensity invariance.
|
|
|
|
% How can song recognition be modelled functionally (feat. Jan Clemens & Co.)?
|
|
% How did we expand on the previous framework?
|
|
% (Still can't stand some of this paragraph's structure and wording...)
|
|
Invariance to non-informative song variations is crucial for reliable song
|
|
recognition; however, it is not sufficient to this end. In order to recognize a
|
|
conspecific song as such, the auditory system needs to extract sufficiently
|
|
informative features of the song pattern and then integrate the gathered
|
|
information into a final categorical percept. Previous authors have proposed a
|
|
functional model framework that describes this process --- feature extraction,
|
|
evidence accumulation, and categorical decision making --- in both
|
|
crickets~(\bcite{clemens2013computational}; \bcite{hennig2014time}) and
|
|
grasshoppers~(\bcite{clemens2013feature}; review on
|
|
both:~\bcite{ronacher2015computational}). Their framework provides a
|
|
comprehensible and biologically plausible account of the computational
|
|
mechanisms required for species-specific song recognition, which has served as
|
|
the inspiration for the development of the model pathway we propose here. The
|
|
existing framework relies on pulse trains as input signals, which were designed
|
|
to capture the essential structural properties of natural song
|
|
envelopes~(\bcite{clemens2013feature}). In the first step, a bank of parallel
|
|
linear-nonlinear feature detectors is applied to the input signal. Each feature
|
|
detector consists of a convolutional filter and a subsequent sigmoidal
|
|
nonlinearity. The outputs of these feature detectors are temporally averaged to
|
|
obtain a single feature value per detector, which is then assigned a specific
|
|
weight. The linear combination of weighted feature values results in a single
|
|
preference value that serves as a predictor for the behavioral response of the
|
|
animal to the presented input signal. Our model pathway adopts the general
|
|
structure of the existing framework but modifies it in several key aspects. The
|
|
convolutional filters, which have previously been fitted to behavioral data for
|
|
each individual species~(\bcite{clemens2013computational}), are replaced by a
|
|
larger, generic set of unfitted Gabor basis functions in order to cover a wide
|
|
range of possible song features across different species. Gabor functions
|
|
approximate the general structure of the filters used in the existing framework
|
|
as well as the filter functions found in various auditory
|
|
neurons~(\bcite{rokem2006spike}; \bcite{clemens2011efficient};
|
|
\bcite{clemens2012nonlinear}). The fitted sigmoidal nonlinearities in the
|
|
existing framework consistently exhibited very steep slopes and are therefore
|
|
replaced by shifted Heaviside step-functions, which results in a binarization
|
|
of the feature detector outputs. Another, more substantial modification is that
|
|
the feature detector outputs are temporally averaged in a way that does not
|
|
condense them into single feature values but retains their time-varying
|
|
structure. This is in line with the fact that songs are not discrete units but
|
|
part of a continuous acoustic stream that the auditory system has to process in
|
|
real time. Moreover, a time-varying feature representation only stabilizes
|
|
after a certain delay following the onset of a song, which emphasizes the
|
|
temporal dynamics of evidence accumulation towards a final categorical
|
|
decision. The most notable difference between our model pathway and the
|
|
existing framework, however, lies in the addition of a physiologically inspired
|
|
preprocessing stage, whose starting point corresponds to the initial reception
|
|
of airborne sound waves. This allows the model to operate on unmodified
|
|
recordings of natural grasshopper songs instead of condensed pulse train
|
|
approximations, which widens its scope towards more realistic, ecologically
|
|
relevant scenarios. For instance, we were able to investigate the contribution
|
|
of different processing stages to the emergence of intensity-invariant song
|
|
representations based on actual field recordings of songs at different
|
|
distances from the sender.
|
|
% Forgive me, it's friday.
|
|
In the following, we outline the structure of the proposed model of the
|
|
grasshopper auditory pathway, from the initial reception of sound waves up to
|
|
the generation of a high-dimensional, time-varying feature representation that
|
|
is suitable for species-specific song recognition. We provide a side-by-side
|
|
account of the known physiological processing steps and their functional
|
|
approximation by basic mathematical operations. We then elaborate on two key
|
|
mechanisms that drive the emergence of intensity-invariant song representations
|
|
within the auditory pathway.
|
|
|
|
% SCRAPPED UNTIL FURTHER NOTICE:
|
|
% Multi-species, multi-individual communally inhabited environments\\
|
|
% - Temporal overlap: Simultaneous singing across individuals/species common\\
|
|
% - Frequency overlap: Little speciation into frequency bands (likely unused)\\
|
|
% - "Biotic noise": Hetero-/conspecifics ("Another one's songs are my noise")\\
|
|
% - "Abiotic noise": Wind, water, vegetation, anthropogenic\\
|
|
% - Effects of habitat structure on sound propagation (landscape - soundscape)\\
|
|
% $\rightarrow$ Sensory constraints imposed by the (acoustic) environment
|
|
|
|
% Cluster of auditory challenges (interlocking constraints $\rightarrow$ tight coupling):\\
|
|
% From continuous acoustic input, generate neuronal representations that...\\
|
|
% 1)...allow for the separation of relevant (song) events from ambient noise floor\\
|
|
% 2)...compensate for behaviorally non-informative song variability (invariances)\\
|
|
% 3)...carry sufficient information to characterize different song patterns,
|
|
% recognize the ones produced by conspecifics, and make appropriate behavioral
|
|
% decisions based on context (sender identity, song type, mate/rival quality)
|
|
|
|
% How can the auditory system of grasshoppers meet these challenges?\\
|
|
% - What are the minimum functional processing steps required?\\
|
|
% - Which known neuronal mechanisms can implement these steps?\\
|
|
% - Which and how many stages along the auditory pathway contribute?\\
|
|
% $\rightarrow$ What are the limitations of the system as a whole?
|
|
|
|
% How can a human observer conceive a grasshopper's auditory percepts?\\
|
|
% - How to investigate the workings of the auditory pathway as a whole?\\
|
|
% - How to systematically test effects and interactions of processing parameters?\\
|
|
% - How to integrate the available knowledge on anatomy, physiology, ethology?\\
|
|
% $\rightarrow$ Abstract, simplify, formalize $\rightarrow$ Functional model framework
|
|
|
|
\section{Developing a functional model of the\\grasshopper song recognition pathway}
|
|
|
|
% Too long (no splitting, only pruning).
|
|
The essence of constructing a functional model of a given system is to gain a
|
|
sufficient understanding of the system's essential structural components and
|
|
their presumed functional roles; and to then build a formal framework of
|
|
manageable complexity around these two aspects. Anatomically, the organization
|
|
of the grasshopper song recognition pathway can be outlined as a feed-forward
|
|
network of three consecutive neuronal
|
|
populations~(Fig.\,\mbox{\ref{fig:pathway}a--c}): Peripheral auditory receptor
|
|
neurons, whose axons enter the ventral nerve cord at the level of the
|
|
metathoracic ganglion; local interneurons that remain exclusively within the
|
|
thoracic region of the ventral nerve cord; and ascending neurons projecting
|
|
from the thoracic region towards the supraesophageal
|
|
ganglion~(\bcite{rehbein1974structure}; \bcite{rehbein1976auditory};
|
|
\bcite{eichendorf1980projections}). The input to the network originates at the
|
|
tympanal membrane, which acts as acoustic receiver and is coupled to the
|
|
dendritic endings of the receptor neurons~(\bcite{gray1960fine}). The outputs
|
|
from the network converge in the supraesophageal ganglion, which is presumed to
|
|
harbor the neuronal substrate for conspecific song recognition and response
|
|
initiation~(\bcite{ronacher1986routes}; \bcite{bauer1987separate};
|
|
\bcite{bhavsar2017brain}). Functionally, the ascending neurons are the most
|
|
diverse of the three populations along the pathway. Individual ascending
|
|
neurons possess highly specific response properties that contrast with the
|
|
rather homogeneous response properties of the preceding receptor neurons and
|
|
local interneurons~(\bcite{clemens2011efficient}), indicating a transition from
|
|
a uniform population-wide processing stream into several parallel branches.
|
|
Based on these anatomical and physiological considerations, the overall
|
|
structure of the model pathway is divided into two distinct
|
|
stages~(Fig.\,\ref{fig:pathway}d). The preprocessing stage incorporates the
|
|
known physiological processing steps at the levels of the tympanal membrane,
|
|
the receptor neurons, and the local interneurons; and operates on
|
|
one-dimensional signal representations. The feature extraction stage
|
|
corresponds to the processing within the ascending neurons and further
|
|
downstream towards the supraesophageal ganglion; and operates on
|
|
high-dimensional signal representations. The details of each physiological
|
|
processing step and its functional approximation within the two stages are
|
|
outlined in the following sections.
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_auditory_pathway.pdf}
|
|
\caption{\textbf{Schematic organization of the grasshopper song recognition
|
|
pathway and structure of the functional model pathway.}
|
|
\textbf{a}:~Simplified course of the pathway in the
|
|
grasshopper, from the tympanal membrane over receptor
|
|
neurons, local interneurons, and ascending neurons further
|
|
towards the supraesophageal ganglion.
|
|
\textbf{b}:~Schematic of synaptic connections between
|
|
the three neuronal populations within the metathoracic
|
|
ganglion.
|
|
\textbf{c}:~Network representation of neuronal connectivity.
|
|
\textbf{d}:~Flow diagram of consecutive signal
|
|
representations~(boxes) and transformations~(arrows) along
|
|
the model pathway. All representations are time-varying.
|
|
1st half: Preprocessing stage~(one-dimensional
|
|
representation). 2nd half: Feature extraction
|
|
stage~(high-dimensional representation). }
|
|
\label{fig:pathway}
|
|
\end{figure}
|
|
|
|
\subsection{Population-driven signal preprocessing}
|
|
|
|
Grasshoppers receive airborne sound waves via a tympanal organ on either side of
|
|
the body. The tympanal membrane acts as a mechanical resonance filter for
|
|
sound-induced vibrations~(\bcite{windmill2008time}; \bcite{malkin2014energy}).
|
|
Vibrations that fall within specific frequency bands are focused on different
|
|
membrane areas, while others are attenuated. This processing step can be
|
|
approximated by an initial bandpass filter
|
|
\begin{equation}
|
|
\filt(t)\,=\,\raw(t)\,*\,\bp, \qquad \fc\,=\,5\,\text{kHz},\,30\,\text{kHz}
|
|
\label{eq:bandpass}
|
|
\end{equation}
|
|
applied to the acoustic input signal $\raw(t)$. The auditory receptor neurons
|
|
transduce the vibrations of the tympanal membrane into sequences of action
|
|
potentials. Thereby, they encode the amplitude modulation, or envelope, of the
|
|
signal~(\bcite{machens2001discrimination}), which likely involves a rectifying
|
|
nonlinearity~(\bcite{machens2001representation}). This can be modelled as
|
|
full-wave rectification followed by lowpass filtering
|
|
\begin{equation}
|
|
\env(t)\,=\,|\filt(t)|\,*\,\lp, \qquad \fc\,=\,250\,\text{Hz}
|
|
\label{eq:env}
|
|
\end{equation}
|
|
of the tympanal signal $\filt(t)$. Furthermore, the receptors exhibit a
|
|
sigmoidal response curve over logarithmically compressed intensity
|
|
levels~(\bcite{suga1960peripheral}; \bcite{gollisch2002energy}). In the model
|
|
pathway, logarithmic compression is achieved by conversion to decibel scale
|
|
\begin{equation}
|
|
\db(t)\,=\,20\,\cdot\,\dec \frac{\env(t)}{\dbref}, \qquad \dbref\,=\,1
|
|
\label{eq:log}
|
|
\end{equation}
|
|
relative to the common reference intensity $\dbref$.
|
|
Both the receptor neurons~(\bcite{romer1976informationsverarbeitung};
|
|
\bcite{gollisch2004input}; \bcite{fisch2012channel}) and, on a larger scale,
|
|
the subsequent local interneurons~(\bcite{hildebrandt2009origin};
|
|
\bcite{clemens2010intensity}) adapt their firing rates in response to sustained
|
|
stimulus intensity levels, which allows for the robust encoding of faster
|
|
amplitude modulations against a slowly changing overall baseline intensity.
|
|
Functionally, the adaptation mechanism resembles a highpass filter
|
|
\begin{equation}
|
|
\adapt(t)\,=\,\db(t)\,*\,\hp, \qquad \fc\,=\,10\,\text{Hz}
|
|
\label{eq:highpass}
|
|
\end{equation}
|
|
over the logarithmically scaled envelope $\db(t)$. This processing step
|
|
concludes the preprocessing stage of the model pathway. The resulting
|
|
intensity-adapted envelope $\adapt(t)$ is then passed on from the local
|
|
interneurons to the ascending neurons, where it serves as the basis for the
|
|
following feature extraction stage.
|
|
|
|
% Cite somewhere:
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_pre_stages.pdf}
|
|
\caption{\textbf{Representations of a song of \textit{O. rufipes} during
|
|
the preprocessing stage.}
|
|
\textbf{a}:~Bandpass filtered tympanal signal $\filt(t)$.
|
|
\textbf{b}:~Signal envelope $\env(t)$.
|
|
\textbf{c}:~Logarithmically compressed envelope $\db(t)$.
|
|
\textbf{d}:~Intensity-adapted envelope $\adapt(t)$.
|
|
}
|
|
\label{fig:stages_pre}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\subsection{Feature extraction by individual neurons}
|
|
|
|
The ascending neurons extract and encode a number of different features of the
|
|
preprocessed signal. As a population, they hence represent the signal in a
|
|
higher-dimensional space than the preceding receptor neurons and local
|
|
interneurons. Each ascending neuron is assumed to scan the signal for a
|
|
specific template pattern, which can be thought of as a kernel of a particular
|
|
structure and on a particular time scale. This process, known as template
|
|
matching, can be modelled as a convolution
|
|
\begin{equation}
|
|
c_i(t)\,=\,\adapt(t)\,*\,k_i(t)
|
|
= \infint \adapt(\tau)\,\cdot\,k_i(t\,-\,\tau)\,d\tau
|
|
\label{eq:conv}
|
|
\end{equation}
|
|
of the intensity-adapted envelope $\adapt(t)$ with a kernel $k_i(t)$ per
|
|
ascending neuron. We use Gabor kernels as basis functions for creating
|
|
different template patterns. An arbitrary one-dimensional, real Gabor kernel is
|
|
generated by multiplication of a Gaussian envelope and a sinusoidal carrier
|
|
\begin{equation}
|
|
k_i(t,\,\kwi,\,\kfi,\,\kpi)\,=\,e^{-\frac{t^{2}}{2{\kwi}^{2}}}\,\cdot\,\sin(\kfi\,t\,+\,\kpi), \qquad \kfi\,=\,2\pi\fsin
|
|
\label{eq:gabor}
|
|
\end{equation}
|
|
with Gaussian standard deviation or kernel width $\kwi$, carrier frequency
|
|
$\kfi$, and carrier phase $\kpi$. Different combinations of $\kw$ and $\kf$
|
|
result in Gabor kernels with different lobe number $\kn$, which is the number
|
|
of half-periods of the carrier that fit under the Gaussian envelope within
|
|
reasonable limits of attenuation. The interval under the Gaussian envelope that
|
|
contains the relevant lobes of the kernel can be defined as Gaussian full-width
|
|
measured at relative peak height $\rh$
|
|
\begin{equation}
|
|
\fwrh(\kw,\,\rh)\,=\,2\,\cdot\,\sqrt{-2\,\cdot\,\ln \rh}\,\cdot\,\kw, \qquad \rh\,\in\,(0,\,1].
|
|
\end{equation}
|
|
With this, an appropriate carrier frequency $\kf$ for obtaining a Gabor kernel
|
|
with width $\kw$ and desired lobe number $\kn$ can be approximated as
|
|
% \begin{equation}
|
|
% \kf(\kn,\,\fwrh)\,=\,\frac{0.5\,\cdot\,\kn\,+\,\off}{\fwrh}, \qquad \kn\,\geq\,2\enspace\forall\enspace \kn\,\in\,\mathbb{Z}
|
|
% \end{equation}
|
|
\begin{equation}
|
|
\kf(\kn,\,\kw,\,\rh)\,=\,\frac{\kn\,+\,\off}{4\,\cdot\,\sqrt{-2\,\cdot\,\ln \rh}\,\cdot\,\kw}, \qquad \kn\,\geq\,2\enspace\forall\enspace \kn\,\in\,\mathbb{Z}
|
|
\end{equation}
|
|
where $\off$ is a small positive offset to the near-linear relationship between
|
|
$\kf$ and $\kn$ to balance the amplitude of the $\kn$ desired lobes of the
|
|
kernel --- which should be maximized --- against the amplitude of the
|
|
next-outer lobes, which should not exceed the threshold value determined by
|
|
$\rh$. For $\kn=1$, carrier frequency $\kf$ is set to zero, which results in a
|
|
simple Gaussian kernel. Carrier phase $\kp$ determines the position of the
|
|
kernel lobes relative to the kernel center. By setting $\kp$ to one of only
|
|
four specific phase values~(Tab.\,\ref{tab:gabor_phases}), we restrict the
|
|
Gabor kernels to be either even functions~(mirror-symmetric, odd $\kn$) or
|
|
odd functions~(point-symmetric, even $\kn$) with either positive or negative
|
|
sign, which refers to the sign of the kernel's central lobe (even kernels) or
|
|
the left of the two central lobes (odd kernels).
|
|
\FloatBarrier
|
|
\begin{table}[!ht]
|
|
\centering
|
|
\captionsetup{width=.46\textwidth}
|
|
\caption{Values of phase $\kp$ that are specific for the four major groups
|
|
of Gabor kernels.}
|
|
\begin{tabular}{|ccc|}
|
|
\hline
|
|
sign & even kernels & odd kernels\\
|
|
\hline
|
|
$+$ & $+\pi\,/\,2$ & $\pi$\\
|
|
$-$ & $-\pi\,/\,2$ & $0$\\
|
|
\hline
|
|
\end{tabular}
|
|
\label{tab:gabor_phases}
|
|
\end{table}
|
|
\FloatBarrier
|
|
These four major groups of Gabor kernels allow for the extraction of different
|
|
types of signal features, such as the presence of peaks (even, $+$), troughs
|
|
(even, $-$), onsets (odd, $+$), and offsets (odd, $-$) at various time scales.
|
|
% Add kernel normalization here.
|
|
Following the convolutional template matching, each kernel-specific response
|
|
$c_i(t)$ is passed through a shifted Heaviside step-function $\nl$ with
|
|
threshold value $\thr$ to obtain a binary response
|
|
\begin{equation}
|
|
b_i(t,\,\thr)\,=\,\begin{cases}
|
|
\;1, \quad c_i(t)\,>\,\thr\\
|
|
\;0, \quad c_i(t)\,\leq\,\thr
|
|
\end{cases}
|
|
\label{eq:binary}
|
|
\end{equation}
|
|
which can be thought of as a categorization into ``relevant'' and ``irrelevant''
|
|
response values. In the grasshopper, these thresholding nonlinearities might
|
|
either be part of the processing within the ascending neurons or take place
|
|
further downstream~(SOURCE). Finally, the responses of the ascending neurons
|
|
are assumed to be integrated somewhere in the supraesophageal
|
|
ganglion~(\bcite{ronacher1986routes}; \bcite{bauer1987separate};
|
|
\bcite{bhavsar2017brain}). This processing step can be approximated as temporal
|
|
averaging of the binary responses $b_i(t)$ by a lowpass filter
|
|
\begin{equation}
|
|
f_i(t)\,=\,b_i(t)\,*\,\lp, \qquad \fc\,=\,1\,\text{Hz}
|
|
\label{eq:lowpass}
|
|
\end{equation}
|
|
to obtain a final set of slowly changing kernel-specific features $f_i(t)$. In
|
|
the resulting high-dimensional feature space, different species-specific song
|
|
patterns are characterized by a distinct combination of feature values, which
|
|
can be read out by a simple linear classifier.
|
|
|
|
% Cite somewhere:
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_feat_stages.pdf}
|
|
\caption{\textbf{Representations of a song of \textit{O. rufipes} during
|
|
the feature extraction stage.}
|
|
Different color shades indicate different types of Gabor
|
|
kernels with specific lobe number $\kn$ and either $+$ or
|
|
$-$ sign, sorted (dark to light) first by increasing $\kn$
|
|
and then by sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then
|
|
$-$ for each $\kn$; two kernel widths $\kw$ of $4\,$ms and
|
|
$32\,$ms per type; 8 types, 16 kernels in total).
|
|
\textbf{a}:~Kernel-specific filter responses $c_i(t)$.
|
|
\textbf{b}:~Binary responses $b_i(t)$.
|
|
\textbf{c}:~Finalized features $f_i(t)$.}
|
|
\label{fig:stages_feat}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\section{Mechanisms driving the emergence of\\intensity-invariant song representation}
|
|
|
|
% Still missing the SNR analysis. Should be able to write around it for now.
|
|
The robustness of song recognition is tied to the degree of intensity
|
|
invariance of the finalized feature representation. Ideally, the values of each
|
|
feature should depend only on the relative amplitude dynamics of the song
|
|
pattern but not on the overall intensity of the song. In the grasshopper, the
|
|
emergence of intensity-invariant representations along the song recognition
|
|
pathway likely is a distributed process that involves different neuronal
|
|
populations, which raises the question of what the essential computational
|
|
mechanisms are that drive this process. Within the model pathway, we identified
|
|
two key mechanisms that render the song representation more invariant to
|
|
intensity variations. The two mechanisms each comprise a nonlinear signal
|
|
transformation followed by a linear signal transformation but differ in the
|
|
specific operations involved, as outlined in the following sections.
|
|
|
|
\subsection{Full-wave rectification \& lowpass filtering}
|
|
|
|
The first nonlinear transformation along the model pathway is the full-wave
|
|
rectification of the tympanal signal $\filt(t)$ during the extraction of the
|
|
signal envelope (Eq.\,\ref{eq:env}). Rectification transforms the distribution
|
|
of $\filt(t)$ from an approximately zero-centered distribution with both
|
|
positive and negative values into a strictly non-negative distribution. Signal
|
|
envelope $\env(t)$ is then obtained by lowpass filtering the rectified
|
|
$\filt(t)$. The effects of this transformation pair on SNR and potential
|
|
intensity invariance were analyzed by rescaling and processing the input signal
|
|
$\raw(t)$ and comparing standard deviations between the resulting $\filt(t)$
|
|
and $\env(t)$, once for the noiseless case~(Fig.\,\ref{fig:rect-lp}a) and once
|
|
for the noisy case~(Fig.\,\ref{fig:rect-lp}b). In addition, the cutoff
|
|
frequency $\fc$ of the lowpass filter was varied to investigate the influence
|
|
of different filter bandwidths. In the noiseless case, the standard deviations
|
|
of $\filt(t)$ and $\env(t)$ are each reduced compared to the input $\raw(t)$ by
|
|
a multiplicative factor. These factors are constant across all $\sca$, which
|
|
results in a downward shift of the respective curve on a double-logarithmic
|
|
scale, away from the diagonal~(Fig.\,\ref{fig:rect-lp}c). For $\filt(t)$, the
|
|
reduction is a consequence of the bandpass filtering~(Eq.\,\ref{eq:bandpass})
|
|
of $\raw(t)$. For $\env(t)$, the standard deviation is further reduced compared
|
|
to $\filt(t)$. Rectification contributes much less to this reduction than
|
|
lowpass filtering. The degree of reduction by lowpass filtering depends on the
|
|
cutoff frequency $\fc$, with lower $\fc$ (narrow bandwidth) resulting in a
|
|
stronger reduction. In the noisy case, the standard deviations of $\filt(t)$
|
|
and $\env(t)$ can be related to the respective pure-noise reference standard
|
|
deviation~(Fig.\,\ref{fig:rect-lp}d). This causes each curve to start with a
|
|
constant regime of SNR values near 1 for smaller $\sca$, which reflects the
|
|
dominance of the noise component $\noc(t)$ over the song component $\soc(t)$ in
|
|
the input $\raw(t)$. For larger $\sca$, all curves transition into a regime of
|
|
linearly increasing SNR on a double-logarithmic scale. For $\filt(t)$, the
|
|
linear part of the curve deviates only slightly from the diagonal. For
|
|
$\env(t)$, however, the transition occurs at lower $\sca$ compared to
|
|
$\filt(t)$, and the linear part of the curve is shifted leftward away from the
|
|
diagonal, which means that higher SNR values are achieved for the same $\sca$.
|
|
This effect is more pronounced for lower $\fc$ of the lowpass filter and is
|
|
presumably caused by the attenuation of high-frequency components in the
|
|
signal, which are more prominent in the noise component $\noc(t)$ than in the
|
|
song component $\soc(t)$. The effect also appears relatively consistent across
|
|
different species, although small variations exist~(Fig.\,\ref{fig:rect-lp}e)
|
|
that are presumably based on different song structures and frequency spectra.
|
|
In summary, the standard deviation of $\env(t)$ has never been observed to
|
|
transition into a saturation regime for larger $\sca$ but rather continues to
|
|
increase proportionally to $\sca$ for all tested $\fc$, in both the noiseless
|
|
and the noisy case and across different species. Consequently, the combination
|
|
of rectification and lowpass filtering does not contribute to intensity
|
|
invariance. However, this transformation pair does improve the SNR of $\env(t)$
|
|
relative to $\filt(t)$ and thus provides subsequent processing stages with a
|
|
more robust input representation and higher input SNR.
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_rect_lp.pdf}
|
|
\caption{\textbf{Rectification and lowpass filtering improves SNR
|
|
but does not contribute to intensity invariance.}
|
|
Input $\raw(t)$ consists of song component $\soc(t)$ scaled by
|
|
$\sca$ with optional noise component $\noc(t)$ and is
|
|
successively transformed into tympanal signal $\filt(t)$ and
|
|
envelope $\env(t)$. Different line styles indicate different
|
|
cutoff frequencies $\fc$ of the lowpass filter extracting
|
|
$\env(t)$.
|
|
\textbf{Top}:~Example representations of $\filt(t)$ and
|
|
$\env(t)$ for different $\sca$.
|
|
\textbf{a}:~Noiseless case.
|
|
\textbf{b}:~Noisy case.
|
|
\textbf{Bottom}:~Intensity metrics over a range of $\sca$.
|
|
\textbf{c}:~Noiseless case: Standard deviations $\sigma_x$ of
|
|
$\filt(t)$ and $\env(t)$.
|
|
\textbf{d}:~Noisy case: Ratios of $\sigma_x$ of $\filt(t)$ and
|
|
$\env(t)$ to the respective reference standard deviation
|
|
$\sigma_{\eta}$ for input $\raw(t)=\noc(t)$.
|
|
\textbf{e}:~Ratios of $\sigma_x$ to $\sigma_{\eta}$ of
|
|
$\env(t)$ as in \textbf{d} for different species (averaged
|
|
over songs and recordings, see appendix
|
|
Fig.\,\ref{fig:app_rect-lp}).
|
|
}
|
|
\label{fig:rect-lp}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\subsection{Logarithmic compression \& spike-frequency adaptation}
|
|
|
|
The second nonlinear transformation along the model pathway is the logarithmic
|
|
compression of the signal envelope $\env(t)$ into $\db(t)$, Eq.\,\ref{eq:log},
|
|
which is then followed by the highpass filtering of $\db(t)$,
|
|
Eq.\,\ref{eq:highpass}, to obtain the intensity-adapted envelope $\adapt(t)$.
|
|
The interplay of this transformation pair was analyzed by rescaling and
|
|
processing the input signal $\filt(t)$ and comparing standard deviations
|
|
between the resulting $\env(t)$, $\db(t)$, and $\adapt(t)$. It is necessary to
|
|
use $\filt(t)$ as input for this analysis instead of $\env(t)$, because
|
|
$\env(t)$ results from a nonlinear transformation and hence cannot be
|
|
synthesized as an additive mixture of song component $\soc(t)$ and noise
|
|
component $\noc(t)$. % <-- Sentence may be methods section material.
|
|
However, it is much easier to conceive a mathematical description of the
|
|
effects of logarithmic compression and adaptation if $\env(t)$ itself is
|
|
assumed to be composed of $\soc(t)$ and $\noc(t)$. In the noiseless
|
|
case~(Fig.\,\ref{fig:log-hp}a), $\env(t)$ takes the form of
|
|
\begin{equation}
|
|
\env(t)\,=\,\sca\,\cdot\,\soc(t), \qquad \env(t)\,>\,0\enspace\forall\enspace t\,\in\,\mathbb{R}
|
|
\label{eq:toy_env_pure}
|
|
\end{equation}
|
|
The standard deviation of $\env(t)$ increases linearly with $\sca$ on a
|
|
double-logarithmic scale and is slightly reduced~(Fig.\,\ref{fig:log-hp}c)
|
|
compared to the input $\filt(t)$, which is consistent with the results of the
|
|
previous analysis~(Fig.\,\ref{fig:rect-lp}c). By conversion of $\env(t)$ to
|
|
decibel scale, $\sca$ turns from a multiplicative scale in linear space into an
|
|
additive term, or offset, in logarithmic space:
|
|
\begin{equation}
|
|
\db(t)\,=\,20\,\cdot\,\dec \left[\,\sca\,\cdot\,s(t)\,\right]\,=\,20\,\cdot\,\left[\dec \sca\,+\,\dec s(t)\right], \qquad \sca\,>\,0
|
|
\label{eq:toy_log_pure}
|
|
\end{equation}
|
|
The highpass filtering of $\db(t)$ can be approximated as a subtraction of the
|
|
local signal offset within a suitable time interval $0 \ll \thp <
|
|
\frac{1}{\fc}$:
|
|
\begin{equation}
|
|
\begin{split}
|
|
\adapt(t)\,\approx\,\db(t)\,-\,20\,\cdot\,\dec \sca\,=\,20\,\cdot\,\dec s(t)
|
|
\end{split}
|
|
\label{eq:toy_highpass_pure}
|
|
\end{equation}
|
|
This eliminates $\sca$ from $\adapt(t)$ and thus renders it perfectly
|
|
intensity-invariant, with a constant standard deviation of around 10\,dB across
|
|
all $\sca>0$~(Fig.\,\ref{fig:log-hp}c). In contrast, in the noisy
|
|
case~(Fig.\,\ref{fig:log-hp}b), $\env(t)$ takes the form of
|
|
\begin{equation}
|
|
\env(t)\,=\,\sca\,\cdot\,\soc(t)\,+\,\noc(t), \qquad \env(t)\,>\,0\enspace\forall\enspace t\,\in\,\mathbb{R}
|
|
\label{eq:toy_env_noise}
|
|
\end{equation}
|
|
Similar to the previous analysis~(Fig.\,\ref{fig:rect-lp}d), the ratio of the
|
|
standard deviation of $\env(t)$ to its pure-noise reference standard deviation
|
|
on a double-logarithmic scale follows a constant regime for small $\sca$ and a
|
|
linearly increasing regime for larger $\sca$~(Fig.\,\ref{fig:log-hp}d). Decibel
|
|
conversion of $\env(t)$
|
|
% \begin{equation}
|
|
% \begin{split}
|
|
% \db(t)\,&=\,20\,\cdot\,\dec \left[\,\sca\,\cdot\,s(t)\,+\,\eta(t)\,\right]\\
|
|
% &=\,20\,\cdot\,\left(\dec \sca\,+\,\dec \left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]\right), \qquad \sca\,>\,0
|
|
% \end{split}
|
|
% \label{eq:toy_log_noise}
|
|
% \end{equation}
|
|
\begin{equation}
|
|
\db(t)\,=\,20\,\cdot\,\left(\dec \sca\,+\,\dec \left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]\right), \qquad \sca\,>\,0
|
|
\label{eq:toy_log_noise}
|
|
\end{equation}
|
|
allows for the separation of $\sca$ from $\soc(t)$ but introduces a scaling of
|
|
$\noc(t)$ by the inverse of $\sca$, which remains present even after the offset
|
|
subtraction:
|
|
\begin{equation}
|
|
\begin{split}
|
|
\adapt(t)\,\approx\,20\,\cdot\,\dec\left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]
|
|
\end{split}
|
|
\label{eq:toy_highpass_noise}
|
|
\end{equation}
|
|
% \begin{equation}
|
|
% \begin{split}
|
|
% \adapt(t)\,\approx\,\db(t)\,-\,20\,\cdot\,\dec \sca\,=\,20\,\cdot\,\dec\left[s(t)\,+\,\frac{\eta(t)}{\sca}\right]
|
|
% \end{split}
|
|
% \label{eq:toy_highpass_noise}
|
|
% \end{equation}
|
|
This means that, in the noisy case, $\sca$ cannot be entirely eliminated from
|
|
$\adapt(t)$, only redistributed between $\soc(t)$ and $\noc(t)$. If $\sca$ is
|
|
sufficiently large ($\sca\gg1$, saturation regime), $\noc(t)$ is attenuated to
|
|
the point of being negligible, so that $\adapt(t)$ is a scale-free
|
|
representation of $\soc(t)$. If $\sca$ and $\noc(t)$ are at similar scales
|
|
($\sca\approx1$, transient regime), $\adapt(t)$ largely resembles $\db(t)$.
|
|
Finally, if $\sca$ is sufficiently small ($0<\sca\ll1$, noise regime),
|
|
$\noc(t)$ masks $\soc(t)$ even after the intensity adaptation. Accordingly, the
|
|
effective intensity invariance of $\adapt(t)$ through logarithmic compression
|
|
and adaptation is limited by the SNR of $\env(t)$: Songs that have already
|
|
sunk into the noise floor at the level of $\env(t)$ cannot be recovered by
|
|
subsequent processing steps, which emphasizes the importance of the SNR
|
|
improvement by rectification and lowpass filtering during the previous
|
|
processing step~(Fig.\,\ref{fig:rect-lp}d). The general pattern of noise
|
|
regime, transient regime, and saturation regime remains consistent across
|
|
different species~(Fig.\,\ref{fig:log-hp}e). However, the specific value of
|
|
$\sca$ at which the saturation regime is reached (see appendix
|
|
Fig.\,\ref{fig:app_log-hp_saturation}) and the maximum SNR value of $\adapt(t)$
|
|
within the saturation regime vary considerably between and within species. For
|
|
example, \textit{C. biguttulus} and \textit{C. mollis} display a noticeably
|
|
lower maximum SNR of $\adapt(t)$ compared to other species. These differences
|
|
are not to be underestimated, since the SNR of $\adapt(t)$ within the
|
|
saturation regime determines the maximum input SNR for subsequent processing
|
|
steps. In other words, the fact that $\adapt(t)$ eventually reaches a
|
|
saturation regime is, of course, desirable in the context of intensity
|
|
invariance, but it also means forgoing the higher SNR values that are
|
|
achieved by $\env(t)$ for the same $\sca$ (up to several orders of magnitude,
|
|
Fig.\,\ref{fig:log-hp}d). This trade-off between intensity invariance and SNR
|
|
is a recurring phenomenon that is further addressed in the following sections.
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_log_hp.pdf}
|
|
\caption{\textbf{Intensity invariance through logarithmic compression and
|
|
adaptation is restricted by the noise floor and decreases
|
|
SNR.}
|
|
Input $\filt(t)$ consists of song component $\soc(t)$
|
|
scaled by $\sca$ with optional noise component $\noc(t)$
|
|
and is successively transformed into envelope $\env(t)$,
|
|
logarithmically compressed envelope $\db(t)$, and
|
|
intensity-adapted envelope $\adapt(t)$.
|
|
\textbf{Top}:~Example representations of $\env(t)$,
|
|
$\db(t)$, and $\adapt(t)$ for different $\sca$.
|
|
\textbf{a}:~Noiseless case.
|
|
\textbf{b}:~Noisy case.
|
|
\textbf{Bottom}:~Intensity metrics over a range of $\sca$.
|
|
\textbf{c}:~Noiseless case: Standard deviations $\sigma_x$
|
|
of $\env(t)$, $\db(t)$, and $\adapt(t)$.
|
|
\textbf{d}:~Noisy case: Ratios of $\sigma_x$ of $\env(t)$,
|
|
$\db(t)$, and $\adapt(t)$ to the respective reference
|
|
standard deviation $\sigma_{\eta}$ for input
|
|
$\filt(t)=\noc(t)$. Shaded areas indicate $5\,\%$ (dark
|
|
grey) and $95\,\%$ (light grey) curve span for
|
|
$\adapt(t)$.
|
|
\textbf{e}:~Ratios of $\sigma_x$ to $\sigma_{\eta}$ of
|
|
$\adapt(t)$ as in \textbf{d} for different species
|
|
(averaged over songs and recordings, see appendix
|
|
Fig.\,\ref{fig:app_log-hp_curves}). Dots indicate $95\,\%$
|
|
curve span per species.
|
|
}
|
|
\label{fig:log-hp}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\subsection{Thresholding nonlinearity \& temporal averaging}
|
|
|
|
The third nonlinear transformation along the model pathway is the thresholding
|
|
nonlinearity $\nl$ that transforms each kernel response $c_i(t)$ into a binary
|
|
response $b_i(t)$, Eq.\,\ref{eq:binary}. This transformation takes place
|
|
after the convolutional filtering of $\adapt(t)$ with kernel $k_i(t)$,
|
|
Eq.\,\ref{eq:conv}, and is followed by the temporal averaging of $b_i(t)$ into
|
|
the feature set $f_i(t)$ by a lowpass filter, Eq.\,\ref{eq:lowpass}. The
|
|
effects of thresholding and temporal averaging are best illustrated based on a
|
|
single kernel~(Fig.\,\ref{fig:thresh-lp_single}) instead of the full set. For
|
|
this analysis, input $\adapt(t)$ was
|
|
rescaled~(Fig.\,\ref{fig:thresh-lp_single}a) and convolved with kernel $k(t)$.
|
|
The resulting kernel response $c(t)$ was passed through $H(c\,-\,\Theta)$ with
|
|
three different threshold values
|
|
$\Theta$~(Fig.\,\ref{fig:thresh-lp_single}b--d). Each resulting binary response
|
|
$b(t)$ was transformed into $f(t)$, whose average feature value serves as a
|
|
measure of intensity~(Fig.\,\ref{fig:thresh-lp_single}ef). The thresholding
|
|
nonlinearity $H(c\,-\,\Theta)$ categorizes the values of $c(t)$ into ``relevant''
|
|
($c(t)>\Theta$, $b(t)=1$) and ``irrelevant'' ($c(t)\leq\Theta$, $b(t)=0$)
|
|
response values. It thereby splits the probability density $\pc$ of $c(t)$
|
|
within some observed time interval $T$ into two complementary parts around
|
|
$\Theta$:
|
|
\begin{equation}
|
|
\int_{\Theta}^{+\infty} \pc\,dc\,=\,1\,-\,\int_{-\infty}^{\Theta} \pc\,dc\,=\,\frac{T_1}{T}, \qquad \infint \pc\,dc\,=\,1
|
|
\label{eq:pdf_split}
|
|
\end{equation}
|
|
The right-sided part of the split $\pc$ corresponds to time $T_1$ where
|
|
$c(t)>\Theta$, while the left-sided part corresponds to time $T_0=T-T_1$ where
|
|
$c(t)\leq\Theta$. The semi-infinite integral over the right-sided part of $\pc$
|
|
represents the ratio of time $T_1$ to total time $T$ because the total
|
|
integral of a probability density is normalized to 1. The lowpass filtering of
|
|
$b(t)$ can be approximated as temporal averaging over a suitable time interval
|
|
$\tlp>\frac{1}{\fc}$ in order to express $f(t)$ as a similar temporal ratio
|
|
\begin{equation}
|
|
f(t)\,\approx\,\frac{1}{\tlp} \int_{t}^{t\,+\,\tlp} b(\tau)\,d\tau\,=\,\frac{T_1}{\tlp}, \qquad b(t)\,\in\,\{0,\,1\}
|
|
\label{eq:feat_avg}
|
|
\end{equation}
|
|
of time $T_1$ during which $b(t)$ is 1 within the averaging interval $\tlp$.
|
|
Therefore, the value of $f(t)$ at every time point $t$ approximately signifies
|
|
the cumulative probability that $c(t)$ exceeds $\Theta$ during the
|
|
corresponding averaging interval $\tlp$:
|
|
\begin{equation}
|
|
f(t)\,\approx\,\int_{\Theta}^{+\infty} \pclp\,dc\,=\,P(c\,>\,\Theta,\,\tlp)
|
|
\label{eq:feat_prop}
|
|
\end{equation}
|
|
In a sense, $f(t)$ can be interpreted as some sort of duty cycle with respect
|
|
to $\Theta$. For example, a feature value of $f(t)=0.4$ means that $c(t)$
|
|
exceeds $\Theta$ for approximately 40\,\% of the time within $\tlp$ around $t$.
|
|
In the most extreme cases, $\Theta$ lies either above the maximum of $c(t)$ or
|
|
below the minimum of $c(t)$, which results in a minimum or maximum possible
|
|
feature value of $f(t)=0$~(Fig.\,\ref{fig:thresh-lp_single}d, left column) or
|
|
$f(t)=1$, respectively.
|
|
|
|
Importantly, $f(t)$ retains information about neither the timing of individual
|
|
threshold crossings nor the precise values of $c(t)$ apart from their relation
|
|
to $\Theta$. Accordingly, for a given $\Theta$, different $\sca$ can still
|
|
result in similar $T_1$ segments (and hence similar feature values) depending
|
|
on the magnitude of the derivative of $c(t)$ in temporal proximity to time
|
|
points at which $c(t)$ crosses $\Theta$: The steeper the slope of $c(t)$, the
|
|
less $T_1$ changes with variations in $\sca$. The most reliable way of
|
|
exploiting this invariant property of $f(t)$ is to set $\Theta$ to a value near
|
|
0, because these values are least affected by different scales of $c(t)$. For
|
|
sufficiently large $\sca$, $f(t)$ then approaches the same constant value in
|
|
both the noiseless and the noisy case~(Fig.\,\ref{fig:thresh-lp_single}e,
|
|
saturation regime).
|
|
|
|
The value of $f(t)$ in the saturation regime is independent of the precise
|
|
value of $\Theta$, but the value of $\sca$ at which the saturation regime is
|
|
reached decreases with $\Theta$~(Fig.\,\ref{fig:thresh-lp_single}e). Therefore,
|
|
a threshold value of $\Theta=0$ would be the optimal choice for achieving
|
|
intensity invariance at the lowest possible $\sca$. In stark contrast, the
|
|
closer $\Theta$ is to 0, the higher the pure-noise response of $f(t)$ and the
|
|
lower the resulting SNR of $f(t)$ between noise regime and saturation
|
|
regime~(Fig.\,\ref{fig:thresh-lp_single}b--d, left column, and
|
|
Fig.\,\ref{fig:thresh-lp_single}e). It is even possible to achieve an
|
|
``unlimited'' SNR of $f(t)$ by setting $\Theta$ above the maximum of the
|
|
pure-noise $c(t)$, so that any value of $f(t)$ greater than 0 indicates the
|
|
presence of the song component $\soc(t)$ in input $\adapt(t)$ at the cost of
|
|
requiring a higher $\sca$ to reach the saturation regime. This trade-off
|
|
between intensity invariance and SNR has already been observed during the
|
|
previous analysis on logarithmic compression and
|
|
adaptation~(Fig.\,\ref{fig:log-hp}d). However, the parameters that determine
|
|
the SNR of $\adapt(t)$ are much less understood and likely relate to properties
|
|
of the signal, whereas the SNR of $f(t)$ depends on the choice of $\Theta$ and
|
|
can be more directly manipulated by the system.
|
|
|
|
Finally, the effects of thresholding and temporal averaging must be seen in the
|
|
context of the previous transformation pair of logarithmic compression and
|
|
adaptation.
|
|
|
|
Finally, the question remains whether the intensity-invariant output $\adapt(t)$
|
|
of the previous transformation pair allows feature
|
|
|
|
Finally, the output $\adapt(t)$ of the previous transformation
|
|
pair~(Fig.\,\ref{fig:log-hp}cd) can be related to the input $\adapt(t)$ of the
|
|
current transformation pair by plotting the values of $f(t)$ over the standard
|
|
deviation of input $\adapt(t)$ instead of
|
|
$\sca$~(Fig.\,\ref{fig:thresh-lp_single}f). This is relevant because, unlike
|
|
$\sca$, the standard deviation of $\adapt(t)$ is capped to a maximum value of
|
|
around 10\,dB by the previous transformation pair~(Fig.\,\ref{fig:log-hp}cd).
|
|
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh_lp_single.pdf}
|
|
\caption{\textbf{Intensity invariance through thresholding and temporal
|
|
averaging is mediated by the interaction of threshold
|
|
value and noise floor.}
|
|
Input $\adapt(t)$ consists of song component $\soc(t)$
|
|
scaled by $\sca$ with optional noise component $\noc(t)$
|
|
and is transformed into single kernel response $c(t)$,
|
|
binary response $b(t)$, and feature $f(t)$. Different
|
|
color shades indicate different threshold values $\Theta$
|
|
(multiples of reference standard deviation $\sigma_{\eta}$
|
|
of $c(t)$ for input $\adapt(t)=\noc(t)$, with darker
|
|
colors for higher $\Theta$).
|
|
\textbf{Left}:~Noisy case: Example representations of
|
|
$\adapt(t)$ as well as $c(t)$, $b(t)$, and $f(t)$ for
|
|
different $\sca$.
|
|
\textbf{a}:~$\adapt(t)$ with kernel $k(t)$ in black.
|
|
\textbf{b\,-\,d}: $c(t)$, $b(t)$, and $f(t)$ based on the
|
|
same $\adapt(t)$ from \textbf{a} but with different
|
|
$\Theta$.
|
|
\textbf{Right}:~Average value $\mu_f$ of $f(t)$ for each
|
|
$\Theta$ from \textbf{b\,-\,d}. Dots indicate $95\,\%$
|
|
curve span (noisy case).
|
|
\textbf{e}:~$\mu_f$ over a range of $\sca$, once for the
|
|
noisy case (solid lines) and once for the noiseless case
|
|
(dotted lines).
|
|
\textbf{f}:~Noisy case: $\mu_f$ over the standard
|
|
deviation of input $\adapt(t)$ corresponding to the values of
|
|
$\sca$ shown in \textbf{e}. Shaded area indicates standard
|
|
deviations that would be capped in the output $\adapt(t)$
|
|
of the previous transformation pair (see
|
|
Fig.\,\ref{fig:log-hp}cd).
|
|
}
|
|
\label{fig:thresh-lp_single}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh_lp_species.pdf}
|
|
\caption{\textbf{Feature representation of different species-specific songs
|
|
saturates at different points in feature space.}
|
|
Same input and processing as in
|
|
Fig.\,\ref{fig:thresh-lp_single} but with three different
|
|
kernels $k_i$, each with a single kernel-specific
|
|
threshold value $\thr=0.5\cdot\sigma_{\eta_i}$.
|
|
\textbf{a}:~Examples of species-specific grasshopper
|
|
songs.
|
|
\textbf{Middle}:~Average value $\mu_{f_i}$ of each feature
|
|
$f_i(t)$ over $\sca$ per species (averaged over songs and
|
|
recordings, see appendix
|
|
Figs.\,\ref{fig:app_thresh-lp_pure} and
|
|
\ref{fig:app_thresh-lp_noise}). Different color shades
|
|
indicate different kernels $k_i$. Dots indicate $95\,\%$
|
|
curve span per $k_i$.
|
|
\textbf{b}:~Noiseless case.
|
|
\textbf{c}:~Noisy case.
|
|
\textbf{Bottom}:~2D feature spaces spanned by each pair of
|
|
$f_i(t)$. Each trajectory corresponds to a
|
|
species-specific combination of $\mu_{f_i}$ that develops
|
|
with $\sca$ (colorbars). Horizontal dashes in the colorbar
|
|
indicate $5\,\%$ (dark grey) and $95\,\%$ (light grey)
|
|
curve span of the norm across all three $\mu_{f_i}$ per
|
|
species.
|
|
\textbf{d}:~Noiseless case.
|
|
\textbf{e}:~Noisy case. Shaded areas indicate the average
|
|
minimum $\mu_{f_i}$ across all species-specific trajectories.
|
|
}
|
|
\label{fig:thresh-lp_species}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
% \caption{\textbf{Rectification and lowpass filtering improves SNR
|
|
% but does not contribute to intensity invariance.}
|
|
% Input $\raw(t)$ consists of song component $\soc(t)$ scaled by
|
|
% $\sca$ with optional noise component $\noc(t)$ and is
|
|
% successively transformed into tympanal signal $\filt(t)$ and
|
|
% envelope $\env(t)$. Different line styles indicate different
|
|
% cutoff frequencies $\fc$ of the lowpass filter extracting
|
|
% $\env(t)$.
|
|
% \textbf{Top}:~Example representations of $\filt(t)$ and
|
|
% $\env(t)$ for different $\sca$.
|
|
% \textbf{a}:~Noiseless case.
|
|
% \textbf{b}:~Noisy case.
|
|
% \textbf{Bottom}:~Intensity metrics over a range of $\sca$.
|
|
% \textbf{c}:~Noiseless case: Standard deviations of $\filt(t)$
|
|
% and $\env(t)$.
|
|
% \textbf{d}:~Noisy case: Ratios of standard deviations of
|
|
% $\filt(t)$ and $\env(t)$ to the respective reference standard
|
|
% deviation for input $\raw(t)=\noc(t)$.
|
|
% \textbf{e}:~Ratios of standard deviations of $\env(t)$ as in
|
|
% \textbf{b} for different species (averaged over songs and
|
|
% recordings, see appendix Fig.\,\ref{fig:app_rect-lp}).
|
|
% }
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_full_Omocestus_rufipes.pdf}
|
|
\caption{\textbf{Step-wise emergence of intensity-invariant song
|
|
representation along the full model pathway.}
|
|
Input $\raw(t)$ consists of song component $\soc(t)$
|
|
scaled by $\sca$ with added noise component $\noc(t)$ and
|
|
is processed up to the feature set $f_i(t)$. Different
|
|
color shades indicate different types of Gabor kernels
|
|
with specific lobe number $\kn$ and either $+$ or $-$
|
|
sign, sorted (dark to light) first by increasing $\kn$ and
|
|
then by sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$
|
|
for each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8,
|
|
and $16\,$ms per type; 8 types, 40 kernels in total).
|
|
\textbf{a}:~Example representations of $\filt(t)$,
|
|
$\env(t)$, $\db(t)$, $\adapt(t)$, $c_i(t)$, and $f_i(t)$
|
|
for different $\sca$.
|
|
\textbf{b}:~Intensity metrics over $\sca$. For $c_i(t)$
|
|
and $f_i(t)$, the median over kernels is shown. Dots
|
|
indicate $95\,\%$ curve span for $\db(t)$, $\adapt(t)$,
|
|
$c_i(t)$, and $f_i(t)$.
|
|
\textbf{c}:~Average value $\mu_{f_i}$ of each feature
|
|
$f_i(t)$ over $\sca$.
|
|
\textbf{d}:~Ratios of intensity metrics to the respective
|
|
reference value for input $\raw(t)=\noc(t)$. For $c_i(t)$
|
|
and $f_i(t)$, the median over kernel-specific ratios is
|
|
shown.
|
|
\textbf{e}:~Ratios of standard deviation $\sigma_{c_i}$ of
|
|
each $c_i(t)$.
|
|
\textbf{f}:~Ratios of $\mu_{f_i}$.
|
|
\textbf{g}:~Distributions of kernel-specific $\sca$ that
|
|
correspond to $95\,\%$ curve span for $c_i(t)$ and
|
|
$f_i(t)$. Dots indicate the values from \textbf{b}.
|
|
}
|
|
\label{fig:pipeline_full}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_short_Omocestus_rufipes.pdf}
|
|
\caption{\textbf{Step-wise emergence of intensity-invariant song
|
|
representation along the model pathway without logarithmic
|
|
compression.}
|
|
Input $\raw(t)$ consists of song component $\soc(t)$
|
|
scaled by $\sca$ with added noise component $\noc(t)$ and
|
|
is processed up to the feature set $f_i(t)$, skipping
|
|
$\db(t)$. Different color shades indicate different types
|
|
of Gabor kernels with specific lobe number $\kn$ and
|
|
either $+$ or $-$ sign, sorted (dark to light) first by
|
|
increasing $\kn$ and then by
|
|
sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for
|
|
each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and
|
|
$16\,$ms per type; 8 types, 40 kernels in total).
|
|
\textbf{a}:~Example representations of $\filt(t)$,
|
|
$\env(t)$, $\adapt(t)$, $c_i(t)$, and $f_i(t)$ for
|
|
different $\sca$.
|
|
\textbf{b}:~Intensity metrics over $\sca$. For $c_i(t)$
|
|
and $f_i(t)$, the median over kernels is shown. Dots
|
|
indicate $95\,\%$ curve span for $f_i(t)$.
|
|
\textbf{c}:~Average value $\mu_{f_i}$ of each feature
|
|
$f_i(t)$ over $\sca$.
|
|
\textbf{d}:~Ratios of intensity metrics to the respective
|
|
reference value for input $\raw(t)=\noc(t)$. For $c_i(t)$
|
|
and $f_i(t)$, the median over kernel-specific ratios is
|
|
shown.
|
|
\textbf{e}:~Ratios of $\mu_{f_i}$.
|
|
\textbf{f}:~Distribution of kernel-specific $\sca$ that
|
|
correspond to $95\,\%$ curve span for $f_i(t)$. Dots
|
|
indicate the value from \textbf{b}.
|
|
}
|
|
\label{fig:pipeline_short}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_features_cross_species.pdf}
|
|
\caption{\textbf{Interspecific and intraspecific feature variability.}
|
|
Average value $\mu_{f_i}$ of each feature $f_i(t)$ against
|
|
its counterpart from a second feature set based on a
|
|
different input $\raw(t)$. Each dot within a subplot
|
|
represents a single feature $f_i(t)$. Different color
|
|
shades indicate different types of Gabor kernels with
|
|
specific lobe number $\kn$ and either $+$ or $-$ sign,
|
|
sorted (dark to light) first by increasing $\kn$ and then
|
|
by sign~($1\,\leq\,\kn\,\leq\,4$; first $+$, then $-$ for
|
|
each $\kn$; five kernel widths $\kw$ of 1, 2, 4, 8, and
|
|
$16\,$ms per type; 8 types, 40 kernels in total). Data is
|
|
based on the analysis underlying
|
|
Fig.\,\ref{fig:pipeline_full}.
|
|
\textbf{Lower triangular}:~Interspecific comparisons
|
|
between single songs of different species.
|
|
\textbf{Upper triangular}:~Intraspecific comparisons
|
|
between different songs of a single species (\textit{O.
|
|
rufipes}).
|
|
\textbf{Lower left}:~Distribution of correlation
|
|
coefficients $\rho$ for each interspecific and
|
|
intraspecific comparison. Dots indicate single $\rho$
|
|
values.
|
|
}
|
|
\label{fig:feat_cross_species}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_field.pdf}
|
|
\caption{\textbf{Step-wise emergence of intensity-invariant song
|
|
representation along the model pathway.}
|
|
}
|
|
\label{fig:pipeline_field}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\section{Conclusions \& outlook}
|
|
|
|
\textbf{Song recognition pathway: Grasshopper vs. model:}\\
|
|
The model pathway includes a rather large number of Gabor kernels compared to
|
|
the 15 to 20 ascending neurons in the grasshopper auditory
|
|
system~(\bcite{stumpner1991auditory}).
|
|
|
|
|
|
\textbf{Definition of invariance (general, systemic):}\\
|
|
Invariance = Property of a system to maintain a stable output with respect to a
|
|
set of relevant input parameters (variation to be represented) but irrespective
|
|
of one or more other parameters (variation to be discarded)\\
|
|
$\rightarrow$ Selective input-output decorrelation
|
|
|
|
\textbf{Definition of intensity invariance (context of neurons and songs):}\\
|
|
Intensity invariance = Time scale-selective sensitivity to certain faster
|
|
amplitude dynamics (song waveform, small-scale AM) and simultaneous
|
|
insensitivity to slower, more sustained amplitude dynamics (transient baseline,
|
|
large-scale AM, current overall intensity level)\\
|
|
$\rightarrow$ Without time scale selectivity, any fully intensity-invariant
|
|
output will be a flat line
|
|
|
|
|
|
\textbf{Log-HP: Implication for intensity invariance:}\\
|
|
- Logarithmic scaling is essential for equalizing different song intensities\\
|
|
$\rightarrow$ Intensity information can be manipulated more easily when in form
|
|
of a signal offset in log-space than a multiplicative scale in linear space
|
|
|
|
- Capability to compensate for intensity variations, i.e.\ selective amplification
|
|
of output $\adapt(t)$ relative to input $\env(t)$, is limited by input SNR (Eq.\,\ref{eq:toy_snr}):\\
|
|
$\rightarrow$ Ability to equalize between different sufficiently large scales of $s(t)$\\
|
|
$\rightarrow$ Inability to recover $s(t)$ when initially masked by noise floor $\eta(t)$
|
|
|
|
- Logarithmic scaling emphasizes small amplitudes (song onsets, noise floor) \\
|
|
$\rightarrow$ Recurring trade-off: Equalizing signal intensity vs.\ preserving initial SNR
|
|
|
|
\textbf{Thresh-LP: Implication for intensity invariance:}\\
|
|
- Role of song periodicity for feature representation!
|
|
|
|
- Suggests a relatively simple rule for optimal choice of threshold value $\thr$:\\
|
|
$\rightarrow$ Find amplitude $c_i$ that maximizes absolute derivative of $c_i(t)$ over time\\
|
|
$\rightarrow$ Optimal with respect to intensity invariance of $f_i(t)$, not necessarily for
|
|
other criteria such as song-noise separation or diversity between features
|
|
|
|
- Nonlinear operations can be used to detach representations from graded physical
|
|
stimulus (to facilitate categorical behavioral decision-making?):\\
|
|
1) Capture sufficiently precise amplitude information: $\env(t)$, $\adapt(t)$\\
|
|
$\rightarrow$ Closely following the AM of the acoustic stimulus\\
|
|
2) Quantify relevant stimulus properties on a graded scale: $c_i(t)$\\
|
|
$\rightarrow$ More decorrelated representation, compared to prior stages\\
|
|
3) Nonlinearity: Distinguish between ``relevant vs.\ irrelevant'' values: $b_i(t)$\\
|
|
$\rightarrow$ Trading a graded scale for two or more categorical states\\
|
|
4) Represent stimulus properties under relevance constraint: $f_i(t)$\\
|
|
$\rightarrow$ Graded again but highly decorrelated from the acoustic stimulus\\
|
|
5) Categorical behavioral decision-making requires further nonlinearities\\
|
|
$\rightarrow$ Parameters of a behavioral response may be graded (e.g.\ approach speed),
|
|
initiation of one behavior over another is categorical (e.g.\ approach/stay)
|
|
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_noise_env_sd_conversion_appendix.pdf}
|
|
\caption{\textbf{}
|
|
}
|
|
\label{fig:app_env-sd}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_rect-lp_appendix.pdf}
|
|
\caption{\textbf{}
|
|
}
|
|
\label{fig:app_rect-lp}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_log-hp_appendix.pdf}
|
|
\caption{\textbf{}
|
|
}
|
|
\label{fig:app_log-hp_curves}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_saturation_log-hp_appendix.pdf}
|
|
\caption{\textbf{}
|
|
}
|
|
\label{fig:app_log-hp_saturation}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh-lp_pure_appendix.pdf}
|
|
\caption{\textbf{}
|
|
}
|
|
\label{fig:app_thresh-lp_pure}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_thresh-lp_noise_appendix.pdf}
|
|
\caption{\textbf{}
|
|
}
|
|
\label{fig:app_thresh-lp_noise}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_thresh_lp_appendix.pdf}
|
|
\caption{\textbf{}
|
|
}
|
|
\label{fig:app_thresh-lp_kern-sd}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_full_appendix.pdf}
|
|
\caption{\textbf{}
|
|
}
|
|
\label{fig:app_full_kern-sd}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_short_appendix.pdf}
|
|
\caption{\textbf{}
|
|
}
|
|
\label{fig:app_short_kern-sd}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_kernel_sd_perc_field_appendix.pdf}
|
|
\caption{\textbf{}
|
|
}
|
|
\label{fig:app_field_kern-sd}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
|
|
\begin{figure}[!ht]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{figures/fig_invariance_cross_species_thresh_appendix.pdf}
|
|
\caption{\textbf{}
|
|
}
|
|
\label{fig:app_cross_species_thresh}
|
|
\end{figure}
|
|
\FloatBarrier
|
|
|
|
\end{document} |