% Encoding: UTF-8
@COMMENT{BibTeX export based on data in FAU CRIS: https://cris.fau.de/}
@COMMENT{For any questions please write to cris-support@fau.de}
@inproceedings{faucris.124183224,
abstract = {The problem of 2D sound-source localization based on a robotic binaural setup and audio-motor learning is addressed. We first introduce a methodology to experimentally verify the existence of a locally-linear bijective mapping between sound-source positions and high-dimensional interaural data, using manifold learning. Based on this local linearity assumption, we propose a novel method, namely probabilistic piecewise affine regression, that learns the localization-to-interaural mapping and its inverse. We show that our method outperforms two state-of-the-art mapping methods, and allows to achieve accurate 2D localization of natural sounds from real world binaural recordings.},
author = {Deleforge, Antoine and Horaud, Radu},
booktitle = {IEEE International Workshop on Machine Learning for Signal Processing (MLSP), 2012},
date = {2012-09-23/2012-09-26},
doi = {10.1109/MLSP.2012.6349784},
faupublication = {yes},
isbn = {978-1-4673-1026-0},
note = {LMS::1519},
peerreviewed = {No},
title = {{2D} sound-source localization on the binaural manifold},
venue = {Santander},
year = {2012}
}
@article{faucris.121869264,
abstract = {In this paper we address the problems of modeling the acoustic space generated by a full-spectrum sound source and of using the learned model for the localization and separation of multiple sources that simultaneously emit sparse-spectrum sounds. We lay theoretical and methodological grounds in order to introduce the binaural manifold paradigm. We perform an in-depth study of the latent low-dimensional structure of the high-dimensional interaural spectral data, based on a corpus recorded with a human-like audiomotor robot head. A non-linear dimensionality reduction technique is used to show that these data lie on a two-dimensional (2D) smooth manifold parameterized by the motor states of the listener, or equivalently, the sound source directions. We propose a probabilistic piecewise affine mapping model (PPAM) specifically designed to deal with high-dimensional data exhibiting an intrinsic piecewise linear structure. We derive a closed-form expectation-maximization (EM) procedure for estimating the model parameters, followed by Bayes inversion for obtaining the full posterior density function of a sound source direction. We extend this solution to deal with missing data and redundancy in real world spectrograms, and hence for 2D localization of natural sound sources such as speech. We further generalize the model to the challenging case of multiple sound sources and we propose a variational EM framework. The associated algorithm, referred to as variational EM for source separation and localization (VESSL) yields a Bayesian estimation of the 2D locations and time-frequency masks of all the sources. Comparisons of the proposed approach with several existing methods reveal that the combination of acoustic-space learning with Bayesian inference enables our method to outperform state-of-the-art methods.},
author = {Deleforge, Antoine and Forbes, Florence and Horaud, Radu},
doi = {10.1142/S0129065714400036},
faupublication = {yes},
journal = {International Journal of Neural Systems},
keywords = {binaural hearing; sound localization; sound-source separation; manifold learning; mixture of regressors; EM inference},
note = {LMS::1521},
peerreviewed = {No},
title = {{Acoustic} space learning for sound-source separation and localization on binaural manifolds},
volume = {25},
year = {2015}
}
@inproceedings{faucris.110070664,
abstract = {We present a method for audio source separation and localization from binaural recordings. The method combines a new generative probabilistic model with time-frequency masking. We suggest that device-dependent relationships between point-source positions and interaural spectral cues may be learnt in order to constrain a mixture model. This allows to capture subtle separation and localization features embedded in the auditory data. We illustrate our method with data composed of two and three mixed speech signals in the presence of reverberations. Using standard evaluation metrics, we compare our method with a recent binaural-based source separation-localization algorithm.},
address = {Berlin, Heidelberg},
author = {Deleforge, Antoine and Horaud, Radu},
booktitle = {Proceedings of the 10th International Conference on Latent Variable Analysis and Source Separation (LVA/ICA), 2012},
date = {2012-03-12/2012-03-15},
doi = {10.1007/978-3-642-28551-6_46},
faupublication = {yes},
isbn = {978-3-642-28551-6},
keywords = {sound source; source position; room impulse response; constrain mixture model; sound intensity level},
note = {LMS::1520},
pages = {372--379},
peerreviewed = {No},
publisher = {Springer},
title = {{A} latently constrained mixture model for audio source separation and localization},
venue = {Tel Aviv},
year = {2012}
}
@incollection{faucris.206972101,
address = {London},
author = {Deleforge, Antoine and Schmidt, Alexander and Kellermann, Walter},
booktitle = {Multimodal Behavior Analysis in the Wild},
doi = {10.1016/b978-0-12-814601-9.00012-2},
editor = {Alameda-Pineda, Xavier and Ricci, Elisa and Sebe, Nicu},
faupublication = {yes},
isbn = {978-0-12-814601-9},
pages = {27--52},
peerreviewed = {Yes},
publisher = {Academic Press},
series = {Computer Vision and Pattern Recognition Series},
title = {{Audio}-motor integration for robot audition},
url = {https://www.elsevier.com/books/multimodal-behavior-analysis-in-the-wild/alameda-pineda/978-0-12-814601-9},
year = {2018}
}
@inproceedings{faucris.123706484,
abstract = {The construction of a humanoid robot, which can communicate with humans
in a natural manner, is a worthwhile and challenging task alike. This
paper discusses some major difficulties encountered in acoustic signal
acquisition by the widely used humanoid robot NAO and the implications
for the design of the signal enhancement algorithms that are needed for
human-robot communication. Measurements with this low-cost robot, whose
microphones and loudspeakers are mounted in the head, show the
challenges for the noise reduction and acoustic echo control (AEC) due
to ego-noise and nonlinear loudspeaker characteristics. It is also
discussed how peripheral microphones at the limbs of the robot can
mitigate these problems and offer new prospects for multi-channel signal
enhancement.},
author = {Löllmann, Heinrich and Barfuß, Hendrik and Deleforge, Antoine and Kellermann, Walter},
booktitle = {11. ITG Fachtagung Sprachkommunikation},
date = {2014-09-24/2014-09-26},
faupublication = {yes},
isbn = {978-3-8007-3640-9},
note = {LMS::1440},
pages = {1--4},
peerreviewed = {No},
publisher = {VDE},
title = {{Challenges} in {Acoustic} {Signal} {Enhancement} for {Human}-{Robot} {Communication}},
url = {http://lms.lnt.de/itgspeech2014/},
venue = {Erlangen},
year = {2014}
}
@inproceedings{faucris.109279324,
abstract = {We address the problem of ego-noise reduction, i.e., suppressing the noise a robot causes by its own motions. Such noise degrades the recorded microphone signal massively such that the robot’s auditory capabilities suffer. To suppress it, it is intuitive to use also motor data, since it provides additional information about the robot’s joints and thereby the noise sources. We propose to fuse motor data to a recently proposed multichannel dictionary algorithm for ego-noise reduction. At training, a dictionary is learned that captures spatial and spectral characteristics of ego-noise. At testing, nonlinear classifiers are used to efficiently associate the current robot’s motor state to relevant sets of entries in the learned dictionary. By this, computational load is reduced by one third in typical scenarios while achieving at least the same noise reduction performance. Moreover, we propose to train dictionaries on different microphone array geometries and use them for ego-noise reduction while the head to which the microphones are mounted is moving. In such scenarios, the motor-guided approach results in significantly better performance values.},
author = {Schmidt, Alexander and Deleforge, Antoine and Kellermann, Walter},
booktitle = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
date = {2016-10-09/2016-10-14},
doi = {10.1109/IROS.2016.7759212},
faupublication = {yes},
isbn = {978-1-5090-3762-9},
pages = {1281--1286},
peerreviewed = {Yes},
title = {{Ego}-noise reduction using a motor data-guided multichannel dictionary},
venue = {Daejeon},
year = {2016}
}
@article{faucris.122957164,
abstract = {In this work we address the problem of approximating high-dimensional data with a low-dimensional representation. We make the following contributions. We propose an inverse regression method which exchanges the roles of input and response, such that the low-dimensional variable becomes the regressor, and which is tractable. We introduce a mixture of locally-linear probabilistic mapping model that starts with estimating the parameters of inverse regression, and follows with inferring closed-form solutions for the forward parameters of the high-dimensional regression problem of interest. Moreover, we introduce a partially-latent paradigm, such that the vector-valued response variable is composed of both observed and latent entries, thus being able to deal with data contaminated by experimental artifacts that cannot be explained with noise models. The proposed probabilistic formulation could be viewed as a latent-variable augmentation of regression. We devise expectation-maximization (EM) procedures based on a data augmentation strategy which facilitates the maximum-likelihood search over the model parameters. We propose two augmentation schemes and we describe in detail the associated EM inference procedures that may well be viewed as generalizations of a number of EM regression, dimension reduction, and factor analysis algorithms. The proposed framework is validated with both synthetic and real data. We provide experimental evidence that our method outperforms several existing regression techniques.},
author = {Deleforge, Antoine and Forbes, Florence and Horaud, Radu},
doi = {10.1007/s11222-014-9461-5},
faupublication = {yes},
journal = {Statistics and Computing},
note = {LMS::1522},
pages = {1--19},
peerreviewed = {No},
title = {{High}-dimensional regression with {Gaussian} mixtures and partially-latent response variables},
volume = {25},
year = {2014}
}
@inproceedings{faucris.123991384,
abstract = {The analysis of hyper-spectral images is often needed to recover physical properties of planets. To address this inverse problem, the use of learning methods have been considered with the advantage that, once a relationship between physical parameters and spectra has been established through training, the learnt relationship can be used to estimate parameters from new images underpinned by the same physical model. Within this framework, we propose a partially-latent regression method which maps high-dimensional inputs (spectral images) onto low-dimensional responses (physical parameters). We introduce a novel regression method that combines a Gaussian mixture of locally-linear mappings with a partially-latent variable model. While the former makes high-dimensional regression tractable, the latter enables to deal with physical parameters that cannot be observed or, more generally, with data contaminated by experimental artifacts that cannot be explained with noise models. The method is illustrated on images collected from the Mars planet.},
author = {Deleforge, Antoine and Forbes, Florence and Horaud, Radu},
booktitle = {22nd European Signal Processing Conference (EUSIPCO)},
date = {2014-09-01/2014-09-05},
faupublication = {yes},
isbn = {978-0-9928-6261-9},
keywords = {hyper-spectral images; regression; dimension reduction; mixture models; latent variable model},
note = {LMS::1517},
pages = {1037--1048},
peerreviewed = {No},
title = {{Hyper}-spectral image analysis with partially-latent regression},
venue = {Lisbon},
year = {2014}
}
@inproceedings{faucris.120107944,
abstract = {We propose a novel method for mapping sound spectrograms onto images and thus enabling alignment between auditory and visual features for subsequent multimodal processing. We suggest a supervised learning approach to this audio-visual fusion problem, on the following grounds. Firstly, we use a Gaussian mixture of locally-linear regressions to learn a mapping from image locations to binaural spectrograms. Secondly, we derive a closed-form expression for the conditional posterior probability of an image location, given both an observed spectrogram, emitted from an unknown source direction, and the mapping parameters that were previously learnt. Prominently, the proposed method is able to deal with completely different spectrograms for training and for alignment. While fixed-length wide-spectrum sounds are used for learning, thus fully and robustly estimating the regression, variable-length sparse-spectrum sounds, e.g., speech, are used for alignment. The proposed method successfully extracts the image location of speech utterances in realistic reverberant-room scenarios.},
author = {Deleforge, Antoine and Drouard, Vincent and Girin, Laurent and Horaud, Radu},
booktitle = {22nd European Signal Processing Conference (EUSIPCO)},
date = {2014-09-01/2014-09-05},
faupublication = {yes},
isbn = {978-0-9928-6261-9},
note = {LMS::1518},
pages = {2470--2474},
peerreviewed = {No},
title = {{Mapping} sounds onto images using binaural spectrograms},
venue = {Lisbon},
year = {2014}
}
@inproceedings{faucris.118040164,
abstract = {We propose a novel sparse representation for heavily underdetermined multichannel sound mixtures, i.e., with much more sources than microphones. The proposed approach operates in the complex Fourier domain, thus preserving spatial characteristics carried by phase differences. We derive a generalization of K-SVD which jointly estimates a dictionary capturing both spectral and spatial features, a sparse activation matrix, and all instantaneous source phases from a set of signal examples. This dictionary can be used to extract the learned signal from a new input mixture. The method is applied to the challenging problem of ego-noise reduction for robot audition. We demonstrate its superiority relative to conventional dictionary-based techniques using real-room recordings.},
author = {Deleforge, Antoine and Kellermann, Walter},
booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
date = {2015-04-19/2015-04-24},
doi = {10.1109/ICASSP.2015.7177990},
faupublication = {yes},
isbn = {978-1-4673-6997-8},
pages = {355--359},
peerreviewed = {Yes},
title = {{Phase}-optimized {K}-{SVD} for signal extraction from underdetermined multichannel sparse mixtures},
venue = {Brisbane},
year = {2015}
}
@article{faucris.120065704,
abstract = {This paper studies the statistical performance of the multichannel Wiener filter (MWF) when the weights are computed using estimates of the sample covariance matrices of the noisy and the noise signals. It is well known that the optimal weights of the minimum variance distortionless response beamformer are only determined by the noisy sample covariance matrix or the noise sample covariance matrix, while those of the MWF are determined by both of them. Therefore, the difficulty increases dramatically in statistically analyzing the MWF when compared to analyzing the MVDR, where the main reason is that expressing the general joint probability density function (p.d.f.) of the two sample covariance matrices presented a hitherto unsolved problem, to the best of our knowledge. For a deeper insight into the statistical performance of the MWF, this paper first introduces a bivariate normal distribution to approximately model the joint p.d.f. of the noisy and the noise sample covariance matrices. Each sample covariance matrix is approximately modeled by a random scalar multiplied by its true covariance matrix. This approximation is designed to preserve both the bias and the mean squared error of the matrix with respect to a natural distance on covariance matrices. The correlation of the bivariate normal distribution, referred to as the sample covariance matrices intrinsic correlation coefficient, captures all second-order dependencies of the noisy and the noise sample covariance matrices. By using the proposed bivariate normal distribution, the performance of the MWF can be predicted from the derived analytical expressions and many interesting results are revealed. As an example, the theoretical analysis demonstrates that the MWF performance may degrade in terms of noise reduction and signal-to-noise-ratio improvement when using more sensors in some noise scenarios.},
author = {Zheng, Chengshi and Deleforge, Antoine and Li, Xiaodong and Kellermann, Walter},
doi = {10.1109/TASLP.2018.2800283},
faupublication = {yes},
journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
keywords = {Bivariate normal distribution; multichannel Wiener filter; sample covariance matrix; statistical analysis},
pages = {951--966},
peerreviewed = {Yes},
title = {{Statistical} analysis of the multichannel {Wiener} filter using a bivariate normal distribution for sample covariance matrices},
url = {http://ieeexplore.ieee.org/document/8276308/},
volume = {26},
year = {2018}
}
@inproceedings{faucris.107695544,
abstract = {Human-robot communication is often faced with the difficult problem of interpreting ambiguous auditory data. For example, the acoustic signals perceived by a humanoid with its on-board microphones contain a mix of sounds such as speech, music, electronic devices, all in the presence of attenuation and reverberations. In this paper we propose a novel method, based on a generative probabilistic model and on active binaural hearing, allowing a robot to robustly perform sound-source separation and localization. We show how interaural spectral cues can be used within a constrained mixture model specifically designed to capture the richness of the data gathered with two microphones mounted onto a human-like artificial head. We describe in detail a novel EM algorithm, we analyse its initialization, speed of convergence and complexity, and we assess its performance with both simulated and real data.},
author = {Deleforge, Antoine and Horaud, Radu},
booktitle = {Proceedings of the seventh annual ACM/IEEE international conference on Human-Robot Interaction},
date = {2012-03-05/2012-03-08},
doi = {10.1145/2157689.2157834},
faupublication = {yes},
isbn = {978-1-4503-1063-5},
keywords = {blind source separation; computational auditory scene analysis; EM algorithm; learning},
note = {LMS::1514},
peerreviewed = {No},
title = {{The} cocktail party robot: {Sound} source separation and localisation with an active binaural head},
venue = {Boston},
year = {2012}
}
@inproceedings{faucris.123441164,
abstract = {We propose a natural way to generalize relative transfer functions (RTFs) to more than one source. We first prove that such a generalization is not possible using a single multichannel spectro-temporal observation, regardless of the number of microphones. We then introduce a new transform for multichannel multi-frame spectrograms, i.e., containing several channels and time frames in each time-frequency bin. This transform allows a natural generalization which satisfies the three key properties of RTFs, namely, they can be directly estimated from observed signals, they capture spatial properties of the sources and they do not depend on emitted signals. Through simulated experiments, we show how this new method can localize multiple simultaneously active sound sources using short spectro-temporal windows, without relying on source separation.},
author = {Deleforge, Antoine and Gannot, Sharon and Kellermann, Walter},
booktitle = {23rd European Signal Processing Conference (EUSIPCO)},
date = {2015-08-31/2015-09-04},
doi = {10.1109/EUSIPCO.2015.7362417},
faupublication = {yes},
isbn = {978-0-9928-6263-3},
keywords = {Grassmannian manifolds; Multiple sound sources localization; Plücker Embedding; Relative Transfer Function},
pages = {419--423},
peerreviewed = {Yes},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Towards} a generalization of relative transfer functions to more than one source},
venue = {Nice},
year = {2015}
}
@inproceedings{faucris.124093024,
abstract = {The sound-source separation and localization (SSL) problems are addressed within a unified formulation. Firstly, a mapping between white-noise source locations and binaural cues is estimated. Secondly, SSL is solved via Bayesian inversion of this mapping in the presence of multiple sparse-spectrum emitters (such as speech), noise and reverberations. We propose a variational EM algorithm which is described in detail together with initialization and convergence issues. Extensive real-data experiments show that the method outperforms the state-of-the-art both in separation and localization (azimuth and elevation).},
author = {Deleforge, Antoine and Forbes, Florence and Horaud, Radu},
booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing},
date = {2013-05-26/2013-05-31},
doi = {10.1109/ICASSP.2013.6637612},
faupublication = {yes},
isbn = {978-1-4799-0356-6},
note = {LMS::1516},
pages = {76--80},
peerreviewed = {No},
title = {{Variational} {EM} for binaural sound-source separation and localization},
venue = {Vancouver},
year = {2013}
}