% Encoding: UTF-8
@COMMENT{BibTeX export based on data in FAU CRIS: https://cris.fau.de/}
@COMMENT{For any questions please write to cris-support@fau.de}
@inproceedings{faucris.224413502,
author = {Liem, Cynthia C. S. and Müller, Meinard and Eck, Douglas and Tzanetakis, George},
booktitle = {Proceedings of the International Conference on Multimedia (ACM Multimedia)},
faupublication = {yes},
pages = {603--604},
peerreviewed = {unknown},
title = {1st {International} {ACM} {Workshop} on {Music} {Information} {Retrieval} with {User}-centered and {Multimodal} {Strategies} ({MIRUM})},
year = {2011}
}
@inproceedings{faucris.224408431,
author = {Liem, Cynthia C. S. and Müller, Meinard and Tjoa, Steven K. and Tzanetakis, George},
booktitle = {Proceedings of the International Conference on Multimedia (ACM Multimedia)},
faupublication = {yes},
pages = {1509--1510},
peerreviewed = {unknown},
title = {2nd {International} {ACM} {Workshop} on {Music} {Information} {Retrieval} with {User}-centered and {Multimodal} {Strategies} ({MIRUM})},
year = {2012}
}
@article{faucris.212135249,
author = {El Baba, Youssef and Walther, Andreas and Habets, Emanuël},
doi = {10.1109/TASLP.2017.2784298},
faupublication = {yes},
journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
keywords = {reflector localization; room geometry inference; Image model; reflection point localization; TOA disambiguation; echo labeling},
pages = {857--872},
peerreviewed = {Yes},
title = {{3D} room geometry inference based on room impulse response stacks},
volume = {26},
year = {2018}
}
@article{faucris.240320587,
abstract = {Sound reproduction systems may highly benefit from detailed knowledge of the acoustic space to enhance the spatial sound experience. This article presents a room geometry inference method based on identification of reflective boundaries using a high-resolution direction-of-arrival map produced via room impulse responses (RIRs) measured with a linear loudspeaker array and a single microphone. Exploiting the sparse nature of the early part of the RIRs, Elastic Net regularization is applied to obtain a 2D polar-coordinate map, on which the direct path and early reflections appear as distinct peaks, described by their propagation distance and direction of arrival. Assuming a separable room geometry with four side-walls perpendicular to the floor and ceiling, and imposing pre-defined geometrical constraints on the walls, the 2D-map is segmented into six regions, each corresponding to a particular wall. The salient peaks within each region are selected as candidates for the first-order wall reflections, and a set of potential room geometries is formed by considering all possible combinations of the associated peaks. The room geometry is then inferred using a cost function evaluated on the higher-order reflections computed via beam tracing. The proposed method is tested with both simulated and measured data.},
author = {Tuna, Cagdas and Canclini, Antonio and Borra, Federico and Gotz, Philipp and Antonacci, Fabio and Walther, Andreas and Sarti, Augusto and Habets, Emanuël},
doi = {10.1109/TASLP.2020.2998299},
faupublication = {yes},
journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
keywords = {DOA estimation; echo labeling; reflector localization; room geometry inference},
note = {CRIS-Team Scopus Importer:2020-07-10},
pages = {1729--1744},
peerreviewed = {Yes},
title = {{3D} {Room} {Geometry} {Inference} {Using} a {Linear} {Loudspeaker} {Array} and a {Single} {Microphone}},
volume = {28},
year = {2020}
}
@inproceedings{faucris.212135631,
address = {Aalborg, Denmark},
author = {Jarrett, Daniel Phillip and Habets, Emanuël and Naylor, Patrick},
booktitle = {Proc. European Signal Processing Conf. (EUSIPCO)},
faupublication = {no},
pages = {442--446},
peerreviewed = {unknown},
publisher = {IEEE},
title = {{3D} source localization in the spherical harmonic domain using a pseudointensity vector},
url = {https://www.scopus.com/record/display.uri?eid=2-s2.0-84863799475&origin=inward},
year = {2010}
}
@article{faucris.212135948,
author = {Chakrabarty, Soumitro and Habets, Emanuël},
doi = {10.1109/TASLP.2017.2752364},
faupublication = {yes},
journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
keywords = {robustness; Spatial filtering; Bayesian beamforming; EM algorithm; direction-of-arrival (DOA) uncertainty},
pages = {145--160},
peerreviewed = {Yes},
title = {{A} {Bayesian} {Approach} to {Informed} {Spatial} {Filtering} with {Robustness} {Against} {DOA} {Estimation} {Errors}},
volume = {26},
year = {2018}
}
@inproceedings{faucris.212136326,
address = {South Brisbane, Australia},
author = {Chakrabarty, Soumitro and Thiergart, Oliver and Habets, Emanuël},
booktitle = {Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)},
doi = {10.1109/ICASSP.2015.7178070},
faupublication = {yes},
isbn = {9781467369978},
keywords = {dereverberation; Bayesian beamforming; microphone array processing},
pages = {753--757},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{A} {Bayesian} approach to spatial filtering and diffuse power estimation for joint dereverberation and noise reduction},
year = {2015}
}
@inproceedings{faucris.113453824,
abstract = {When the overall listening experience is rated, listeners are asked to take every aspect into account which seems important to them, including song, lyrics, mood and audio quality. The results of two previously conducted experiments revealed a significant influence of the signal bandwidth and the spatial reproduction format on the overall listening experience. In this work, a systematic analysis is applied to the results of these two experiments with the purpose to investigate listeners in more detail. Regarding rating the overall listening experience, the results show that listeners can rather be described by continuous variables which reflect their preferences than clear categorizations of different listener types. Furthermore, a regression model for predicting ratings was significantly improved by describing the listeners with such continuous variables. Copyright:},
author = {Herre, Jürgen and Schöffler, Michael},
booktitle = {Proceedings of Sound and Music Computing Conference 2014},
date = {2014-09-14/2014-09-20},
faupublication = {yes},
isbn = {9789604661374},
note = {Vorträge und Veröffentlichungen auf Konferenzen{\_}subjektive Audioqualität},
pages = {886--892},
peerreviewed = {unknown},
publisher = {National and Kapodistrian University of Athens},
title = {{About} the {Different} {Types} of {Listeners} for {Rating} the {Overall} {Listening} {Experience}},
url = {https://www.audiolabs-erlangen.de/fau/assistant/schoeffler/publications},
venue = {Athens},
year = {2014}
}
@inproceedings{faucris.306454017,
author = {Schöffler, Michael and Herre, Jürgen},
booktitle = {Proceedings of SMC Conference},
faupublication = {yes},
note = {Vorträge und Veröffentlichungen auf Konferenzen{\_}Subjektive Audioqualität},
peerreviewed = {unknown},
title = {{About} the {Impact} of {Audio} {Quality} on {Overall} {Listening} {Experience}},
venue = {Stockholm},
year = {2013}
}
@inproceedings{faucris.306041539,
author = {Herre, Jürgen},
booktitle = {Workshop: "W8 - ISO/MPEG- Audio - The New Standard for Universal Spatial / 3D Audio Coding"},
faupublication = {yes},
note = {Eingeladene Konferenz-Workshop-Vorträge, Tutorials},
peerreviewed = {unknown},
title = {{A} {Brief} {Introduction} to {MPEG}-{H} {3DA} {Standardization}},
venue = {New York},
year = {2015}
}
@inproceedings{faucris.306041891,
author = {Herre, Jürgen},
booktitle = {Workshop "W2 - ISO/MPEG-H Audio - The New Standard for Universal Spatial / 3D Audio Coding"},
faupublication = {yes},
note = {Eingeladene Konferenz-Workshop-Vorträge, Tutorials},
peerreviewed = {unknown},
title = {{A} {Brief} {Introduction} to {MPEG}-{H} {3DA} {Standardization}},
venue = {Warsaw},
year = {2015}
}
@article{faucris.224396685,
author = {Schreiber, Hendrik and Müller, Meinard},
doi = {10.1109/TMM.2014.2318517},
faupublication = {yes},
journal = {IEEE Transactions on Multimedia},
pages = {1654--1664},
peerreviewed = {Yes},
title = {{Accelerating} {Index}-{Based} {Audio} {Identification}},
volume = {16},
year = {2014}
}
@inproceedings{faucris.212136629,
address = {Edinburgh, UK},
author = {Schlecht, Sebastian and Habets, Emanuël},
booktitle = {Proc. Conf. on Digital Audio Effects},
faupublication = {yes},
pages = {337--344},
peerreviewed = {unknown},
publisher = {University of Edinburgh},
title = {{Accurate} reverberation time control in feedback delay networks},
url = {https://www.scopus.com/record/display.uri?eid=2-s2.0-85030261358&origin=inward},
year = {2017}
}
@article{faucris.305595434,
abstract = {Larynx microphones (LMs) make it possible to obtain practically crosstalk-free recordings of the human voice by picking up vibrations directly from the throat. This can be useful in a multitude of music information retrieval scenarios related to singing, e.g., the analysis of individual voices recorded in environments with lots of interfering noise. However, LMs have a limited frequency range and barely capture the effects of the vocal tract, which makes the recorded signal unsuitable for downstream tasks that require high-quality recordings. In this paper, we introduce the task of reconstructing a natural sounding, high-quality singing voice recording from an LM signal. With an explicit focus on the singing voice, the problem lies at the intersection of speech enhancement and singing voice synthesis with the additional requirement of faithful reproduction of expressive parameters like intonation. In this context, we make three main contributions. First, we publish a dataset with over 4 hours of popular music we recorded with four amateur singers accompanied by a guitar, where both LM and clean close-up microphone signals are available. Second, we propose a data-driven baseline approach for singing voice reconstruction from LM signals using differentiable signal processing, inspired by a source-filter model that emulates the missing vocal tract effects. Third, we evaluate the baseline with a listening test and further show that it can improve the accuracy of lyrics transcription as an exemplary downstream task.},
author = {Schwär, Simon and Krause, Michael and Fast, Michael and Rosenzweig, Sebastian and Scherbaum, Frank and Müller, Meinard},
doi = {10.5334/tismir.166},
faupublication = {yes},
journal = {Transactions of the International Society for Music Information Retrieval},
keywords = {Larynx Microphone; Singing Voice Reconstruction; Dataset; Differentiable Signal Processing; Singing Analysis},
pages = {30--43},
peerreviewed = {Yes},
title = {{A} {Dataset} of {Larynx} {Microphone} {Recordings} for {Singing} {Voice} {Reconstruction}},
volume = {7},
year = {2024}
}
@inproceedings{faucris.224430182,
address = {Vienna, Austria},
author = {Fremerey, Christian and Kurth, Frank and Müller, Meinard and Clausen, Michael},
booktitle = {Proceedings of the International Conference on Music Information Retrieval (ISMIR)},
faupublication = {yes},
pages = {131--132},
peerreviewed = {unknown},
title = {{A} {Demonstration} of the {SyncPlayer} {System}},
year = {2007}
}
@inproceedings{faucris.318868458,
author = {Schwär, Simon and Rosenzweig, Sebastian and Müller, Meinard},
booktitle = {Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)},
doi = {10.5281/zenodo.5624601},
faupublication = {yes},
pages = {626--633},
peerreviewed = {Yes},
title = {{A} {Differentiable} {Cost} {Measure} for {Intonation} {Processing} in {Polyphonic} {Music}},
year = {2021}
}
@article{faucris.224404248,
author = {Damm, David and Fremerey, Christian and Thomas, Verena and Clausen, Michael and Kurth, Frank and Müller, Meinard},
faupublication = {yes},
journal = {International Journal on Digital Libraries},
pages = {53--71},
peerreviewed = {unknown},
title = {{A} digital library framework for heterogeneous music collections: from document acquisition to cross-modal interaction},
volume = {12},
year = {2012}
}
@inproceedings{faucris.213614601,
address = {Tel Aviv, Israel},
author = {Habets, Emanuël},
booktitle = {Proc. Intl. Workshop Acoust. Echo Noise Control (IWAENC)},
faupublication = {no},
peerreviewed = {unknown},
title = {{A} distortionless subband beamformer for noise reduction in reverberant environments},
year = {2010}
}
@inproceedings{faucris.287477126,
address = {NEW YORK},
author = {Gupta, Kishan and Korse, Srikanth and Edler, Bernd and Fuchs, Guillaume},
booktitle = {2022 IEEE INTERNATIONAL CONFERENCE ON ACOUSTICS, SPEECH AND SIGNAL PROCESSING (ICASSP)},
doi = {10.1109/ICASSP43922.2022.9747410},
faupublication = {yes},
month = {Jan},
note = {CRIS-Team WoS Importer:2023-01-13},
pages = {836--840},
peerreviewed = {unknown},
publisher = {IEEE},
title = {{A} {DNN} {Based} {Post}-{Filter} to {Enhance} the {Quality} of {Coded} {Speech} in {MDCT} {Domain}},
venue = {Singapore},
year = {2022}
}
@inproceedings{faucris.305816683,
author = {Hellmuth, Oliver and Allamanche, E. and Herre, Jürgen and Kastner, Thorsten and Cremer, M. and Hirsch, W.},
booktitle = {111th AES Convention},
faupublication = {no},
note = {herre{\_}papers{\_}audio{\_}watermarking},
peerreviewed = {unknown},
title = {{Advanced} {Audio} {Identification} {Using} {MPEG}-7 {Content} {Description}},
venue = {New York},
volume = {Preprint 5463},
year = {2001}
}
@inproceedings{faucris.116598724,
address = {Vienna},
author = {Herre, Jürgen and Eberlein, Ernst and Schott, Hartmut and Brandenburg, Karlheinz},
faupublication = {no},
note = {Vorträge und Veröffentlichungen auf Konferenzen{\_}subjektive Audioqualität},
pages = {-},
peerreviewed = {unknown},
publisher = {AES},
title = {{Advanced} {Audio} {Measurement} {System} using {Psychoacoustic} {Properties}},
volume = {Preprint 3332},
year = {1992}
}
@inproceedings{faucris.305745110,
author = {Neubauer, Christian and Herre, Jürgen},
booktitle = {109th AES Convention},
faupublication = {no},
note = {herre{\_}papers{\_}audio{\_}watermarking},
peerreviewed = {unknown},
title = {{Advanced} {Watermarking} and its {Applications}},
venue = {Los Angeles},
volume = {Preprint 5176},
year = {2000}
}
@inproceedings{faucris.224440009,
address = {Honolulu, Hawaii, USA},
author = {Clausen, Michael and Müller, Meinard},
booktitle = {Proceedings of the International Symposium on Applied Algebra, Algebraic Algorithms and Error-Correcting Codes (AAECC)},
faupublication = {yes},
pages = {29--42},
peerreviewed = {unknown},
title = {{A} {Fast} {Program} {Generator} of {Fast} {Fourier} {Transforms}},
year = {1999}
}
@inproceedings{faucris.266499116,
address = {NEW YORK},
author = {Strauß, Martin and Edler, Bernd},
booktitle = {2021 IEEE INTERNATIONAL CONFERENCE ON ACOUSTICS, SPEECH AND SIGNAL PROCESSING (ICASSP 2021)},
doi = {10.1109/ICASSP39728.2021.9413999},
faupublication = {yes},
month = {Jan},
note = {CRIS-Team WoS Importer:2021-11-26},
pages = {5754--5758},
peerreviewed = {unknown},
publisher = {IEEE},
title = {{A} {FLOW}-{BASED} {NEURAL} {NETWORK} {FOR} {TIME} {DOMAIN} {SPEECH} {ENHANCEMENT}},
venue = {Toronto, ON},
year = {2021}
}
@inproceedings{faucris.224425909,
address = {Aarhus, Denmark},
author = {Kurth, Frank and Damm, David and Fremerey, Christian and Müller, Meinard and Clausen, Michael},
booktitle = {Proceedings of the European Conference on Research and Advanced Technology for Digital Libraries (ECDL)},
faupublication = {yes},
isbn = {978-3-540-87598-7},
pages = {334--345},
peerreviewed = {unknown},
publisher = {Springer},
series = {Lecture Notes In Computer Science},
title = {{A} {Framework} for {Managing} {Multimodal} {Digitized} {Music} {Collections}},
volume = {5173},
year = {2008}
}
@inproceedings{faucris.302109267,
author = {Westphal, Susanne and Schöffler, Michael and Herre, Jürgen},
booktitle = {Workshop für Innovative Computerbasierte Musikinterfaces (ICMI)},
faupublication = {yes},
note = {herre{\_}papers{\_}subjective{\_}audio{\_}quality},
peerreviewed = {unknown},
title = {{A} {Framework} for {Reporting} {Spatial} {Attributes} of {Sound} {Sources}},
url = {http://icmi-workshop.org/papers/2015/spacialattributes.pdf},
year = {2015}
}
@article{faucris.212137193,
author = {Levin, D. and Habets, Emanuël and Gannot, Sharon},
doi = {10.1109/LSP.2013.2271722},
faupublication = {yes},
journal = {IEEE Signal Processing Letters},
keywords = {array signal processing; Antenna arrays; microphone arrays},
pages = {877--880},
peerreviewed = {Yes},
title = {{A} generalized theorem on the average array directivity factor},
volume = {20},
year = {2013}
}
@inproceedings{faucris.213633855,
address = {Liberec, Czech Republic},
author = {Khan, A. H. and Taseska, Maja and Habets, Emanuël},
booktitle = {Proc. Intl. Conf. on Latent Variable Analysis and Signal Separation},
doi = {10.1007/978-3-319-22482-4{\_}46},
faupublication = {yes},
pages = {396--403},
peerreviewed = {unknown},
publisher = {Springer International Publishing},
title = {{A} geometrically constrained independent vector analysis algorithm for online source extraction},
year = {2015}
}
@inproceedings{faucris.224396092,
address = {Malága, Spain},
author = {Balke, Stefan and Müller, Meinard},
booktitle = {Demos and Late Breaking News of the International Society for Music Information Retrieval Conference (ISMIR)},
faupublication = {yes},
peerreviewed = {unknown},
title = {{A} {Graphical} {User} {Interface} for {Understanding} {Audio} {Retrieval} {Results}},
year = {2015}
}
@inproceedings{faucris.282434270,
address = {BAIXAS},
author = {Strauß, Martin and Paulus, Jouni and Torcoli, Matteo and Edler, Bernd},
booktitle = {INTERSPEECH 2021},
doi = {10.21437/Interspeech.2021-1418},
faupublication = {yes},
month = {Jan},
note = {CRIS-Team WoS Importer:2022-09-30},
pages = {3900--3904},
peerreviewed = {unknown},
publisher = {ISCA-INT SPEECH COMMUNICATION ASSOC},
title = {{A} {Hands}-on {Comparison} of {DNNs} for {Dialog} {Separation} {Using} {Transfer} {Learning} from {Music} {Source} {Separation}},
venue = {Brno},
year = {2021}
}
@inproceedings{faucris.289296687,
abstract = {State-of-the-art acoustic echo and noise reduction combines adaptive filters with a deep neural network-based postfilter. While the signal-to-distortion ratio is often used for training, it is not well-defined for all echo-reduction scenarios. We propose well-defined loss functions for training and modifications of a recently proposed echo reduction system that is based on informed source extraction. The modifications include using a Kalman filter as a prefilter and a cyclical learning rate scheduler. The proposed modifications improve the performance on the blind test set of the Interspeech 2021 AEC challenge. A comparison to the challenge-winner shows that the proposed system underperforms the winner by 0.1 mean opinion score (MOS) points in double-talk echo reduction. However, it outperforms the winner by 0.3 MOS points in echo-only echo reduction. In all other scenarios, both algorithms perform comparably. },
author = {Mack, Wolfgang and Habets, Emanuël},
booktitle = {2022 IEEE Spoken Language Technology Workshop, SLT 2022 - Proceedings},
date = {2023-01-09/2023-01-12},
doi = {10.1109/SLT54892.2023.10023206},
faupublication = {yes},
isbn = {9798350396904},
keywords = {Acoustic Echo Reduction; DNN; Kalman Filter; Masking; Speech Enhancement},
note = {CRIS-Team Scopus Importer:2023-02-17},
pages = {502--508},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{A} {Hybrid} {Acoustic} {Echo} {Reduction} {Approach} {Using} {Kalman} {Filtering} and {Informed} {Source} {Extraction} with {Improved} {Training}},
venue = {Doha, QAT},
year = {2023}
}
@article{faucris.224432090,
author = {Müller, Meinard and Kurth, Frank and Clausen, Michael},
faupublication = {yes},
journal = {Datenbank-Spektrum},
pages = {24--32},
peerreviewed = {unknown},
title = {{Aktuelle} {Aspekte} des {Music} {Information} {Retrieval}},
volume = {6},
year = {2006}
}
@inproceedings{faucris.230325367,
abstract = {Narrowband direction-of-arrival (DOA) estimates are commonly used for source localization, parametric spatial audio coding, and directional filtering. As previously shown, a linear least squares direction estimate can be obtained by minimizing the difference of expected and observed inter-microphone phase differences. In this work, it is shown that phase wrapping induces severe estimation errors especially at frequencies just below spatial aliasing frequencies and in low signal-to-noise ratios. A cost function to mitigate the influence of phase wrapping errors on the DOA estimation is proposed. Even though the proposed cost function is nonlinear, it is shown that one iteration of a gradient descent method with proper initialization provides a large improvement when compared to the linear least squares solution.},
author = {Kabzinski, Tobias and Habets, Emanuël},
booktitle = {European Signal Processing Conference},
date = {2019-09-02/2019-09-06},
doi = {10.23919/EUSIPCO.2019.8902551},
faupublication = {yes},
isbn = {9789082797039},
keywords = {Direction-of-arrival estimation; Microphone arrays; Narrowband; Phase wrapping; Source localization},
note = {CRIS-Team Scopus Importer:2019-12-10},
peerreviewed = {unknown},
publisher = {European Signal Processing Conference, EUSIPCO},
title = {{A} least squares narrowband {DOA} estimator with robustness against phase wrapping},
venue = {A Coruna},
volume = {2019-September},
year = {2019}
}
@misc{faucris.213608692,
author = {Habets, Emanuël},
faupublication = {no},
peerreviewed = {automatic},
title = {{A} literature study on dereverberation in acoustic environments},
year = {2003}
}
@inproceedings{faucris.107514924,
address = {Berlin},
author = {Brandenburg, Karlheinz and Herre, Jürgen},
booktitle = {117th AES Convention},
faupublication = {yes},
keywords = {Tutorial Seminar},
note = {Eingeladene Konferenz-Workshop-Vorträge, Tutorials},
pages = {-},
peerreviewed = {unknown},
publisher = {AES},
series = {Tutorial Seminar},
title = {{All} about {Audio} {Data} {Reduction}},
venue = {Berlin},
year = {2004}
}
@inproceedings{faucris.212121439,
author = {Luis Valero, Maria and Mabande, E. and Habets, Emanuël},
booktitle = {Proc. of the International Workshop on Acoustic Signal Enhancement (IWAENC)},
doi = {10.1109/IWAENC.2018.8521290},
faupublication = {yes},
isbn = {9781538681510},
keywords = {Multi-microphone acoustic echo control; STFT-domain adaptive filters; State space algorithms},
pages = {1--5},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{A} low-complexity state-space architecture for multi-microphone acoustic echo control},
year = {2018}
}
@inproceedings{faucris.214659517,
author = {Thiergart, Oliver and Huang, Weilong and Habets, Emanuël},
booktitle = {Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)},
doi = {10.1109/ICASSP.2016.7471693},
faupublication = {yes},
isbn = {9781479999880},
pages = {340--344},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{A} low complexity weighted least squares narrowband {DOA} estimator for arbitrary array geometries},
year = {2016}
}
@inproceedings{faucris.305807763,
author = {Uhle, Christian and Walther, Andreas and Hellmuth, Oliver and Herre, Jürgen},
booktitle = {AES 30th International Conference},
faupublication = {no},
note = {herre{\_}papers{\_}content{\_}based{\_}techniques},
peerreviewed = {unknown},
title = {{Ambience} {Separation} from {Mono} {Recordings} using {Non}-negative {Matrix} {Factorization}},
venue = {Saariskelä},
year = {2007}
}
@article{faucris.308945943,
abstract = {Blind separation of the sounds in an Ambisonic sound scene is a challenging problem, especially when the spatial impression of these sounds needs to be preserved. In this work, we consider Ambisonic-to-Ambisonic separation of reverberant speech mixtures, optionally containing noise. A supervised learning approach is adopted utilizing a transformer-based deep neural network, denoted by AmbiSep. AmbiSep takes mutichannel Ambisonic signals as input and estimates separate multichannel Ambisonic signals for each speaker while preserving their spatial images including reverberation. The GPU memory requirement of AmbiSep during training increases with the number of Ambisonic channels. To overcome this issue, we propose different aggregation methods.The model is trained and evaluated for first-order and second-order Ambisonics using simulated speech mixtures. Experimental results show that the model performs well on clean and noisy reverberant speech mixtures, and also generalizes to mixtures generated with measured Ambisonic impulse responses.},
author = {Herzog, Adrian and Chetupalli, Srikanth Raj and Habets, Emanuël},
doi = {10.1109/TASLP.2023.3297954},
faupublication = {yes},
journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
keywords = {Ambisonics; Decoding; Encoding; Memory management; noise reduction; Reverberation; Speech processing; speech separation; Training; Transformers},
note = {CRIS-Team Scopus Importer:2023-08-11},
pages = {1--13},
peerreviewed = {Yes},
title = {{AmbiSep}: {Joint} {Ambisonic}-to-{Ambisonic} {Speech} {Separation} and {Noise} {Reduction}},
year = {2023}
}
@inproceedings{faucris.213637545,
address = {Germany},
author = {Chakrabarty, Soumitro and Thiergart, Oliver and Habets, Emanuël},
booktitle = {Proc. of the ITG Conference on Speech Communication},
faupublication = {yes},
pages = {1--5},
peerreviewed = {unknown},
title = {{A} method to analyze the spatial response of informed spatial filters},
year = {2016}
}
@inproceedings{faucris.224420442,
address = {Kobe, Japan},
author = {Grosche, Peter and Müller, Meinard},
booktitle = {Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)},
faupublication = {yes},
pages = {189--194},
peerreviewed = {unknown},
title = {{A} mid-level representation for capturing dominant tempo and pulse information in music recordings},
year = {2009}
}
@article{faucris.212138444,
author = {Braun, Sebastian and Habets, Emanuël},
doi = {10.1186/s13636-015-0077-2},
faupublication = {yes},
journal = {EURASIP Journal on Audio, Speech, and Music Processing},
keywords = {Diffuse power estimation; Dereverberation; Multichannel Wiener filter},
pages = {1--14},
peerreviewed = {Yes},
title = {{A} multichannel diffuse power estimator for dereverberation in the presence of multiple sources},
volume = {2015},
year = {2015}
}
@inproceedings{faucris.213628170,
address = {Florence, Italy},
author = {Astudillo, R. F. and Braun, Sebastian and Habets, Emanuël},
booktitle = {Proc. of the REVERB Challenge Workshop},
faupublication = {yes},
pages = {1--8},
peerreviewed = {unknown},
title = {{A} multichannel feature compensation approach for robust {ASR} in noisy and reverberant environments},
year = {2014}
}
@article{faucris.224414733,
author = {Müller, Meinard and Clausen, Michael and Konz, Verena and Ewert, Sebastian and Fremerey, Christian},
faupublication = {yes},
journal = {Interdisciplinary Science Reviews},
pages = {138--153},
peerreviewed = {Yes},
title = {{A} {Multimodal} {Way} of {Experiencing} and {Exploring} {Music}},
volume = {35},
year = {2010}
}
@book{faucris.318612021,
abstract = {Chapters can be read independently and thus serve as building blocks for individually structured courses. Each chapter is complemented with many examples, figures, exercises, and references for further reading. A related Web page includes additional audio-visual material and Python code examples.},
address = {Cham},
author = {Müller, Meinard},
doi = {10.1007/978-3-030-69808-9},
faupublication = {yes},
isbn = {9783030698072},
keywords = {Audio Retrieval; Fourier Analysis; Music Computing; Music Retrieval; Music Structure Analysis; Pattern Recognition; Signal Processing; Sound Computing; Signal, Image and Speech Processing; Information Storage and Retrieval; Computer Appl. in Arts and Humanities; Music},
peerreviewed = {unknown},
publisher = {Springer},
title = {{Fundamentals} of {Music} {Processing}},
year = {2021}
}
@incollection{faucris.116327244,
address = {USA},
author = {Herre, Jürgen and Purnhagen, Heiko},
booktitle = {Prentice Hall IMSC Multimedia Series},
faupublication = {yes},
isbn = {0-13-061621-4},
note = {herre{\_}books{\_}journals},
pages = {487--544},
peerreviewed = {unknown},
publisher = {ISMC},
title = {{General} {Audio} {Coding}},
year = {2002}
}
@article{faucris.262165091,
abstract = {The acoustic intensity vector and energy density are perceptually relevant physical measures of a sound field that can be used in the context of sound field reproduction or acoustic parameter estimation. In this work, weighted spatial averaging of the intensity vector and energy density is investigated, and the results are expressed in terms of the spherical harmonic coefficients of the sound field. Higher-order spherical harmonic coefficients are incorporated by considering radial averaging. This radial averaging is then generalized, yielding the proposed generalized intensity vector and energy density. Direction-of-arrival and diffuseness estimators are constructed based on the generalized intensity vector and energy density. In the evaluation, the proposed parameter estimators are compared to existing state-of-the-art estimators using simulated signals containing directional, diffuse, and sensor-noise components.},
author = {Herzog, Adrian and Habets, Emanuël},
doi = {10.1121/10.0005473},
faupublication = {yes},
journal = {The Journal of the Acoustical Society of America},
note = {CRIS-Team Scopus Importer:2021-07-30},
pages = {294-306},
peerreviewed = {Yes},
title = {{Generalized} intensity vector and energy density in the spherical harmonic domain: {Theory} and applications},
volume = {150},
year = {2021}
}
@article{faucris.252098898,
abstract = {The spatial properties of a noise field can be described by a spatial coherence function. Synthetic multichannel noise signals exhibiting a specific spatial coherence can be generated by properly mixing a set of uncorrelated, possibly non-stationary, signals. The mixing matrix can be obtained by decomposing the spatial coherence matrix. As proposed in a widely used method, the factorization can be performed by means of a Choleski or eigenvalue decomposition. In this work, the limitations of these two methods are discussed and addressed. In particular, specific properties of the mixing matrix are analyzed, namely, the spectral smoothness and the mix balance. The first quantifies the mixing matrix-filters variation across frequency and the second quantifies the number of input signals that contribute to each output signal. Three methods based on the unitary Procrustes solution are proposed to enhance the spectral smoothness, the mix balance, and both properties jointly. A performance evaluation confirms the improvements of the mixing matrix in terms of objective measures. Furthermore, the evaluation results show that the error between the target and the generated coherence is lowered by increasing the spectral smoothness of the mixing matrix. },
author = {Mirabilii, Daniele and Schlecht, Sebastian J. and Habets, Emanuël},
doi = {10.1121/10.0003565},
faupublication = {yes},
journal = {Journal of the Acoustical Society of America},
note = {CRIS-Team Scopus Importer:2021-03-19},
pages = {1425-1433},
peerreviewed = {Yes},
title = {{Generating} coherence-constrained multisensor signals using balanced mixing and spectrally smooth filters},
volume = {149},
year = {2021}
}
@article{faucris.224437731,
author = {Clausen, Michael and Müller, Meinard},
faupublication = {yes},
journal = {Journal of Symbolic Computation},
pages = {137-156},
peerreviewed = {Yes},
title = {{Generating} {Fast} {Fourier} {Transforms} of {Solvable} {Groups}},
volume = {37},
year = {2004}
}
@article{faucris.212159636,
author = {Habets, Emanuël and Cohen, Israel and Gannot, Sharon},
doi = {10.1121/1.2987429},
faupublication = {no},
journal = {Journal of the Acoustical Society of America},
pages = {2911-2917},
peerreviewed = {Yes},
title = {{Generating} nonstationary multisensor signals under a spatial coherence constraint},
volume = {124},
year = {2008}
}
@article{faucris.212159921,
author = {Habets, Emanuël and Gannot, Sharon},
doi = {10.1121/1.2799929},
faupublication = {no},
journal = {Journal of the Acoustical Society of America},
pages = {3464-3470},
peerreviewed = {Yes},
title = {{Generating} sensor signals in isotropic noise fields},
volume = {122},
year = {2007}
}
@inproceedings{faucris.212160219,
address = {Marrakech, Morocco},
author = {Habets, Emanuël and Craciun, Alexandra and Kowalczyk, Konrad},
booktitle = {Proc. European Signal Processing Conf. (EUSIPCO)},
faupublication = {yes},
isbn = {9780992862602},
keywords = {noise reduction; microphone arrays; spatial sound; parametric sound field model; Wiener filter},
pages = {1--5},
peerreviewed = {unknown},
publisher = {European Signal Processing Conference, EUSIPCO},
title = {{Generating} virtual microphone signals in noisy environments},
url = {https://www.scopus.com/record/display.uri?eid=2-s2.0-84901345127&origin=inward},
venue = {Marrakech},
year = {2013}
}
@inproceedings{faucris.212160535,
address = {Edinburgh, Scotland},
author = {Del Galdo, Giovanni and Thiergart, Oliver and Weller, T. and Habets, Emanuël},
booktitle = {Proc. of the Joint Workshop on Hands-free Speech Communication and Microphone Arrays},
doi = {10.1109/HSCMA.2011.5942394},
faupublication = {yes},
isbn = {9781457709999},
keywords = {Audio recording; Sound localization; Parameter estimation; Spatial sound},
pages = {185-190},
peerreviewed = {unknown},
publisher = {IEEE},
title = {{Generating} virtual microphone signals using geometrical information gathered by distributed arrays},
venue = {Edinburgh},
year = {2011}
}
@inproceedings{faucris.285003013,
abstract = {A geometrically-motivated method for primary-ambient decomposition is proposed and evaluated in an upmixing application. The method consists of two steps, accommodating a particularly intuitive explanation. The first step consists of signal-adaptive rotations applied on the input stereo scene, which translate the primary sound sources into the center of the rotated scene. The second step applies a center-channel extraction method, based on a simple signal model and optimal in the mean-squared-error sense. The performance is evaluated by using the estimated ambient component to enable surround sound starting from real-world stereo signals. The participants in the reported listening test are asked to adjust the audio scene envelopment and find the audio settings that pleases them the most. The possibility for up-mixing enabled by the proposed method is used extensively, and the user satisfaction is significantly increased compared to the original stereo mix.},
author = {Paulus, Jouni and Torcoli, Matteo},
booktitle = {European Signal Processing Conference},
date = {2022-08-29/2022-09-02},
faupublication = {yes},
isbn = {9789082797091},
keywords = {center-channel extraction; listening test; primary-ambient decomposition; up-mixing},
note = {CRIS-Team Scopus Importer:2022-11-11},
pages = {299-303},
peerreviewed = {unknown},
publisher = {European Signal Processing Conference, EUSIPCO},
title = {{Geometrically}-{Motivated} {Primary}-{Ambient} {Decomposition} {With} {Center}-{Channel} {Extraction}},
venue = {Belgrade},
volume = {2022-August},
year = {2022}
}
@incollection{faucris.224434537,
author = {Röder, Tido and Müller, Meinard},
booktitle = {Bewegungs-Sonification und Musteranalyse im Sport},
editor = {Effenberg, Alfred O.},
faupublication = {yes},
pages = {24-28},
peerreviewed = {unknown},
publisher = {Cuviller Verlag},
title = {{Geometrische} {Relationen} für die {Bewegungsanalyse}},
year = {2006}
}
@article{faucris.212160874,
author = {Thiergart, Oliver and Del Galdo, Giovanni and Taseska, Maja and Habets, Emanuël},
doi = {10.1109/TASL.2013.2280210},
faupublication = {yes},
journal = {IEEE Transactions on Audio Speech and Language Processing},
keywords = {position estimation; microphone arrays; Spatial sound acquisition; signal-to-diffuse ratio},
pages = {2583-2594},
peerreviewed = {Yes},
title = {{Geometry}-based spatial sound acquisition using distributed microphone arrays},
volume = {21},
year = {2013}
}
@inproceedings{faucris.224416961,
address = {Utrecht, The Netherlands},
author = {Fremerey, Christian and Müller, Meinard and Clausen, Michael},
booktitle = {Proceedings of the International Conference on Music Information Retrieval (ISMIR)},
faupublication = {yes},
pages = {243-248},
peerreviewed = {unknown},
title = {{Handling} {Repeats} and {Jumps} in {Score}-{Performance} {Synchronization}},
year = {2010}
}
@inproceedings{faucris.224421924,
address = {Rotterdam, Netherlands},
author = {Fremerey, Christian and Damm, David and Müller, Meinard and Kurth, Frank and Clausen, Michael},
booktitle = {Proceedings of the International Conference on Acoustics (NAG/DAGA)},
faupublication = {yes},
pages = {350-351},
peerreviewed = {unknown},
title = {{Handling} {Scanned} {Sheet} {Music} and {Audio} {Recordings} in {Digital} {Music} {Libraries}},
year = {2009}
}
@inproceedings{faucris.224389876,
address = {Shanghai, China},
author = {Füg, Richard and Niedermeier, Andreas and Driedger, Jonathan and Disch, Sascha and Müller, Meinard},
booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
faupublication = {yes},
pages = {445-449},
peerreviewed = {unknown},
title = {{Harmonic}-{Percussive}-{Residual} {Sound} {Separation} {Using} the {Structure} {Tensor} on {Spectrograms}},
year = {2016}
}
@inproceedings{faucris.224395359,
address = {Nürnberg, Germany},
author = {Driedger, Jonathan and Müller, Meinard},
booktitle = {Proceedings of the Deutsche Jahrestagung für Akustik ({DAGA})},
faupublication = {yes},
pages = {1421-1424},
peerreviewed = {unknown},
title = {{Harmonisch}-{Perkussiv}-{Rest} {Zerlegung} von {Musiksignalen}},
year = {2015}
}
@inproceedings{faucris.212161206,
author = {Chakrabarty, Soumitro and Pilakeezhu, D. and Habets, Emanuël},
booktitle = {15th International Workshop on Acoustic Signal Enhancement, IWAENC 2016},
doi = {10.1109/IWAENC.2016.7602919},
faupublication = {yes},
isbn = {9781509020072},
keywords = {Head-orientation; Speech enhancement; Single channel; Video-informed},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Head}-orientation compensation with video-informed single channel speech enhancement},
year = {2016}
}
@article{faucris.262430406,
abstract = {In dynamic virtual reality, visual cues and motor actions aid auditory perception. With multimodal integration and auditory adaptation effects, generic head-related transfer functions (HRTFs) may yield no significant disadvantage to individual HRTFs regarding accurate auditory perception. This study compares two individual HRTF sets against a generic HRTF set by way of objective analysis and two subjective experiments. First, auditory-model-based predictions examine the objective deviations in localization cues between the sets. Next, the HRTFs are compared in a static subjective (N = 8) localization experiment. Finally, the localization accuracy, timbre, and overall quality of the HRTF sets are evaluated subjectively (N = 12) in a six-degrees-of-freedom audio-visual virtual environment. The results show statistically significant objective deviations between the sets, but no perceived localization or overall quality differences in the dynamic virtual reality.},
author = {Rummukainen, Olli S. and Robotham, Thomas and Habets, Emanuël},
doi = {10.3390/app11146646},
faupublication = {yes},
journal = {Applied Sciences},
note = {CRIS-Team WoS Importer:2021-08-06},
peerreviewed = {Yes},
title = {{Head}-{Related} {Transfer} {Functions} for {Dynamic} {Listeners} in {Virtual} {Reality}},
volume = {11},
year = {2021}
}
@article{faucris.307870998,
abstract = {Instrument activity detection is a fundamental task in music information retrieval, serving as a basis for many applications, such as music recommendation, music tagging, or remixing. Most published works on this task cover popular music and music for smaller ensembles. In this paper, we embrace orchestral and opera music recordings as a rarely considered scenario for automated instrument activity detection. Orchestral music is particularly challenging since it consists of intricate polyphonic and polytimbral sound mixtures where multiple instruments are playing simultaneously. Orchestral instruments can naturally be arranged in hierarchical taxonomies, according to instrument families. As the main contribution of this paper, we show that a hierarchical classification approach can be used to detect instrument activity in our scenario, even if only few fine-grained, instrument-level annotations are available. We further consider additional loss terms for improving the hierarchical consistency of predictions. For our experiments, we collect a dataset containing 14 hours of orchestral music recordings with aligned instrument activity annotations. Finally, we perform an analysis of the behavior of our proposed approach with regard to potential confounding errors.},
author = {Krause, Michael and Müller, Meinard},
doi = {10.1109/TASLP.2023.3291506},
faupublication = {yes},
journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
keywords = {Analytical models; Annotations; hierarchical classification; instrument activity detection; Instruments; Music; music information retrieval; music processing; orchestral music; Recording; Speech processing; Task analysis},
note = {CRIS-Team Scopus Importer:2023-07-21},
pages = {1-12},
peerreviewed = {Yes},
title = {{Hierarchical} {Classification} for {Instrument} {Activity} {Detection} in {Orchestral} {Music} {Recordings}},
year = {2023}
}
@inproceedings{faucris.287474625,
address = {NEW YORK},
author = {Krause, Michael and Müller, Meinard},
booktitle = {2022 IEEE INTERNATIONAL CONFERENCE ON ACOUSTICS, SPEECH AND SIGNAL PROCESSING (ICASSP)},
doi = {10.1109/ICASSP43922.2022.9747690},
faupublication = {yes},
month = {Jan},
note = {CRIS-Team WoS Importer:2023-01-13},
pages = {406-410},
peerreviewed = {unknown},
publisher = {IEEE},
title = {{HIERARCHICAL} {CLASSIFICATION} {OF} {SINGING} {ACTIVITY}, {GENDER}, {AND} {TYPE} {IN} {COMPLEX} {MUSIC} {RECORDINGS}},
venue = {Singapore},
year = {2022}
}
@inproceedings{faucris.224419216,
address = {Taipei, Taiwan},
author = {Ewert, Sebastian and Müller, Meinard and Grosche, Peter},
booktitle = {Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
faupublication = {yes},
isbn = {978-1-4244-2354-5},
pages = {1869-1872},
peerreviewed = {unknown},
title = {{High} {Resolution} {Audio} {Synchronization} {Using} {Chroma} {Onset} {Features}},
year = {2009}
}
@inproceedings{faucris.318843358,
author = {Tamer, Nazif Can and Özer, Yigitcan and Müller, Meinard and Serra, Xavier},
booktitle = {Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)},
faupublication = {yes},
pages = {223-230},
peerreviewed = {Yes},
title = {{High}-{Resolution} {Violin} {Transcription} {Using} {Weak} {Labels}},
year = {2023}
}
@inproceedings{faucris.320121571,
abstract = {A descriptive transcription of a violin performance requires detecting not only the notes but also the fine-grained pitch variations, such as vibrato. Most existing deep learning methods for music transcription do not capture these variations and often need frame-level annotations, which are scarce for the violin. In this paper, we propose a novel method for high-resolution violin transcription that can leverage piece-level weak labels for training. Our conformer-based model works on the raw audio waveform and transcribes violin notes and their corresponding pitch deviations with 5.8 ms frame resolution and 10-cent frequency resolution. We demonstrate that our method (1) outperforms generic systems in the proxy tasks of violin transcription and pitch estimation, and (2) can automatically generate new training labels by aligning its feature representations with unseen scores. We share our model along with 34 hours of score-aligned solo violin performance dataset, notably including the 24 Paganini Caprices.
Ongoing digitization efforts lead to vast amounts of music data, e.g., audio and video recordings, symbolically encoded scores, or graphical sheet music. Accessing this data in a convenient way requires flexible retrieval strategies. One access paradigm is known as “query by example,” where a short music excerpt in a specific representation is given as a query. The task is to automatically retrieve documents from a music database that are similar to the query in certain parts or aspects. This thesis addresses two different cross-version retrieval scenarios of Western classical music, where the aim is to find the database’s audio recordings that are based on the same musical work as the query. Depending on the respective scenario, one requires task-specific audio representations to compare the query and the database documents. Various approaches for learning such audio representations with deep neural networks are proposed, leading to improvements in the efficiency of the search and the quality of the retrieval results.
In this paper we use foot-mounted inertial measurement units (IMU) as a mobile solution to measure the gait of 21 HSP patients while performing a 4 by 10 m walk at self-selected pace. Two algorithms common to other gait analysis solutions, the hidden Markov model (HMM) and dynamic time warping (DTW), were applied to these signals in order to investigate their effectiveness when faced with the heterogeneous nature and range of foot strike techniques of HSP gait, sometimes even lacking a heel strike. Using a nested cross validation for parameter choice and validation, the HMM was found to be superior for segmentation purposes with a mean segmentation error of 0.10 ± 0.05 s. Stride segmentation of such a diverse dataset is the first step towards creating a clinically relevant system which could assist physicians working with HSP patients by providing objective, automated gait parameters. To the best of the authors’ knowledge, this is the first paper to investigate solutions for mobile gait analysis of patients affected by HSPs. Ultimately, automated, mobile gait analysis of HSP patients would allow ongoing and long term monitoring, providing useful insights into this orphan disease},
author = {Martindale, Christine and Strauss, Martin and Gaßner, Heiko and List, Julia and Müller, Meinard and Klucken, Jochen and Kohl, Zacharias and Eskofier, Björn},
booktitle = {Engineering in Medicine and Biology Society (EMBC), 2017 39th Annual International Conference of the IEEE},
date = {2017-07-11/2017-07-15},
doi = {10.1109/EMBC.2017.8037062},
faupublication = {yes},
isbn = {978-1-5090-2809-2},
keywords = {Hidden Markov models; segmentation; gait analysis; hereditary spastic paraplegia},
peerreviewed = {Yes},
publisher = {IEEE},
title = {{Segmentation} of gait sequences using inertial sensor data in hereditary spastic paraplegia},
url = {https://www.mad.tf.fau.de/files/2018/09/embc2017_martindale.pdf},
venue = {Jeju Island, South Korea},
year = {2017}
}
@incollection{faucris.318900840,
author = {Klauk, Stephanie and Kleinertz, Reiner and Weiß, Christof and Müller, Meinard},
booktitle = {Jahrbuch 2017 des Staatlichen Instituts für Musikforschung (SIM) — Preußischer Kulturbesitz},
editor = {Simone Hohmaier},
faupublication = {yes},
isbn = {978-3-7957-1005-7},
pages = {271-300},
peerreviewed = {Yes},
publisher = {Schott Music},
title = {{Seitensatz} versus {Mittelsatz}: {Expositionen} in {Beethovens} frühen {Klaviersonaten} zwischen zeitgenössischer {Theorie} und computergestützter {Analyse}},
year = {2021}
}
@article{faucris.212185590,
author = {Rummukainen, Olli and Schlecht, Sebastian and Habets, Emanuël},
doi = {10.1121/1.5064957},
faupublication = {yes},
journal = {Journal of the Acoustical Society of America},
pages = {EL340-EL345},
peerreviewed = {Yes},
title = {{Self}-translation induced minimum audible angle},
volume = {144},
year = {2018}
}
@inproceedings{faucris.112430164,
abstract = {Workshop: Audio gets smart - the what and why of semantic audio analysis},
address = {New York},
author = {Herre, Jürgen},
booktitle = {115th AES Convention},
faupublication = {no},
note = {Eingeladene Konferenz-Workshop-Vorträge, Tutorials},
pages = {-},
peerreviewed = {unknown},
publisher = {AES},
title = {{Semantic} {Audio} {Analysis} \& {Metadata} {Standards}},
venue = {New York},
year = {2003}
}
@inproceedings{faucris.309471893,
author = {Adami, Alexander and Herre, Jürgen},
booktitle = {AES International Conference on Semantic Audio},
faupublication = {yes},
keywords = {Demonstration},
note = {Vorträge und Veröffentlichungen auf Konferenzen{\_}Applaus},
peerreviewed = {unknown},
series = {Demonstration},
title = {{Semantic} {Decomposition} of {Applause}-{Like} {Signals} and {Applications}},
venue = {Erlangen},
year = {2017}
}
@inproceedings{faucris.309659574,
author = {Herre, Jürgen},
booktitle = {AES International Conference on Semantic Audio},
faupublication = {yes},
note = {Sitzungsleitungen auf Konferenzen usw.},
peerreviewed = {unknown},
title = {{Session} "{Audio} {Source} {Separation}"},
venue = {Erlangen},
year = {2017}
}
@inproceedings{faucris.309659044,
author = {Herre, Jürgen},
booktitle = {144th AES Convention},
faupublication = {yes},
note = {Sitzungsleitungen auf Konferenzen usw.},
peerreviewed = {unknown},
title = {{Session} "{P10} - {Audio} {Coding}, {Analysis}, and {Synthesis}"},
venue = {Milan},
year = {2018}
}
@inproceedings{faucris.224420687,
address = {Kobe, Japan},
author = {Fremerey, Christian and Clausen, Michael and Ewert, Sebastian and Müller, Meinard},
booktitle = {Proceedings of the International Conference on Music Information Retrieval (ISMIR)},
faupublication = {yes},
pages = {645-650},
peerreviewed = {unknown},
title = {{Sheet} {Music}-{Audio} {Identification}},
year = {2009}
}
@inproceedings{faucris.114632364,
address = {Geneva},
author = {Brandenburg, Karlheinz and Grill, Bernhard and Herre, Jürgen and Väänänen, R.},
booktitle = {IEEE International Symposium on Circuits and Systems},
faupublication = {no},
note = {Eingeladene Vorträge auf Konferenzen etc.},
pages = {-},
peerreviewed = {unknown},
publisher = {IEEE},
title = {{Short} {Course} "{MPEG}-4 {Audio}"},
venue = {Geneva},
year = {2000}
}
@incollection{faucris.108824144,
address = {Brandenburg},
author = {Neugebauer, Christian and Herre, Jürgen and Brandenburg, Karlheinz},
booktitle = {Handbuch der Musikwirtschaft},
faupublication = {no},
isbn = {3-7808-0188-4},
note = {herre{\_}books{\_}journals},
pages = {144-154},
peerreviewed = {unknown},
publisher = {Josef Keller Verlag},
title = {{Sicherheitstechnologien} für den elektronischen {Musikvertrieb}},
year = {2003}
}
@inproceedings{faucris.212185873,
address = {Tokyo, Japan},
author = {Schlecht, Sebastian and Habets, Emanuël},
booktitle = {Proc. of the AES International Conference on Spatial Reproduction-Aesthetics and Science},
faupublication = {yes},
isbn = {9781510870406},
pages = {370-379},
peerreviewed = {unknown},
publisher = {Audio Engineering Society},
title = {{Sign}-agnostic matrix design for spatial artificial reverberation with feedback delay networks},
url = {https://www.scopus.com/record/display.uri?eid=2-s2.0-85056807714&origin=inward},
year = {2018}
}
@inproceedings{faucris.256926832,
abstract = {We refer to direction-of-arrivals (DOAs) estimation of a user-defined subset of directional (desired) sound sources as signal-aware DOA estimation. Source selection, thereby, can be achieved with time-frequency masks to apply attention to TF bins dominated by desired sources. With deep neural networks (DNNs), another option is to train the DNN to estimate the DOAs only of specific classes, like speech, and disregard the DOAs of other classes. Consequently, changing the desired classes requires retraining the DNN. Also, the mask-based approaches are trained for sources known prior to DNN training. To obtain a flexible signal-aware DOA estimator, we propose to use binary mask attention with a DNN for multi-source DOA estimation trained with artificial noise. The desired sources are determined via binary masks, which allows a redefinition by changing the masks. Consequently, the DOA estimator is independent of the desired sources. We experiment with attention in form of oracle and estimated binary masks.},
author = {Mack, Wolfgang and Bharadwaj, Ullas and Chakrabarty, Soumitro and Habets, Emanuël},
booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
date = {2020-05-04/2020-05-08},
doi = {10.1109/ICASSP40776.2020.9053658},
faupublication = {yes},
isbn = {9781509066315},
keywords = {Attention; DOA; Micro-phone Arrays; Signal-Aware},
note = {CRIS-Team Scopus Importer:2021-04-30},
pages = {4930-4934},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Signal}-{Aware} {Broadband} {DOA} {Estimation} {Using} {Attention} {Mechanisms}},
venue = {Barcelona, ESP},
volume = {2020-May},
year = {2020}
}
@article{faucris.270671262,
abstract = {The direction-of-arrival (DOA) of sound sources is an essential acoustic parameter used, e.g., for multi-channel speech enhancement or source tracking. Complex acoustic scenarios consisting of sources-of-interest, interfering sources, reverberation, and noise make the estimation of the DOAs corresponding to the sources-of-interest a challenging task. Recently proposed attention mechanisms allow DOA estimators to focus on the sources-of-interest and disregard interference and noise, i.e., they are signal-aware. The attention is typically obtained by a deep neural network (DNN) from a short-time Fourier transform (STFT) based representation of a single microphone signal. Subsequently, attention has been applied as binary or ratio weighting to STFT-based microphone signal representations to reduce the impact of frequency bins dominated by noise, interference, or reverberation. The impact of attention on DOA estimators and different training strategies for attention and DOA DNNs are not yet studied in depth. In this paper, we evaluate systems consisting of different DNNs and signal processing-based methods for DOA estimation when attention is applied. Additionally, we propose training strategies for attention-based DOA estimation optimized via a DOA objective, i.e., end-to-end. The evaluation of the proposed and the baseline systems is performed using data generated with simulated and measured room impulse responses of a uniform-linear microphone array under various acoustic conditions, like reverberation times, noise, and source array distances. The data contains a single source-of-interest, noise, and directional interference. The best-performing systems are also evaluated using measured data. Our experiments show that DNNs used for DOA estimation are biased to the spectral source characteristics and the spectral attention distribution used during training (e.g., spectrally flat/sparse). 
We also show that this bias in the DOA estimator can be avoided if signal-processing methods are used in combination with attention. Overall, DOA estimation using attention in combination with signal-processing methods exhibits a far lower computational complexity than a fully DNN-based system; however, it yields comparable results.},
author = {Mack, Wolfgang and Wechsler, Julian and Habets, Emanuël},
doi = {10.1016/j.csl.2022.101363},
faupublication = {yes},
journal = {Computer Speech and Language},
keywords = {Attention; Deep learning; Direction-of-arrival; Signal-dependent},
note = {CRIS-Team Scopus Importer:2022-03-11},
peerreviewed = {unknown},
title = {{Signal}-aware direction-of-arrival estimation using attention mechanisms},
volume = {75},
year = {2022}
}
@inproceedings{faucris.212186169,
address = {Florence, Italy},
author = {Luis Valero, Maria and Mabande, E. and Habets, Emanuël},
booktitle = {Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)},
doi = {10.1109/ICASSP.2014.6854738},
faupublication = {yes},
isbn = {9781479928927},
keywords = {Late Residual Echo Estimation; Acoustic Echo Cancellation; Late Reverberation Estimation},
pages = {5914-5918},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Signal}-based late residual echo spectral variance estimation},
venue = {Florence},
year = {2014}
}
@article{faucris.212186473,
author = {Naylor, P. A. and Gaubitch, Nikolay and Habets, Emanuël},
doi = {10.1155/2010/127513},
faupublication = {no},
journal = {Journal of Electrical and Computer Engineering},
peerreviewed = {unknown},
title = {{Signal}-based performance evaluation of dereverberation algorithms},
year = {2010}
}
@inproceedings{faucris.273565529,
address = {KESSARIANI},
author = {Herzog, Adrian and Habets, Emanuël},
booktitle = {29TH EUROPEAN SIGNAL PROCESSING CONFERENCE (EUSIPCO 2021)},
doi = {10.23919/eusipco54536.2021.9616236},
faupublication = {yes},
month = {Jan},
note = {CRIS-Team WoS Importer:2022-04-22},
pages = {96-100},
peerreviewed = {unknown},
publisher = {EUROPEAN ASSOC SIGNAL SPEECH & IMAGE PROCESSING-EURASIP},
title = {{Signal}-{Dependent} {Mixing} for {Direction}-{Preserving} {Multichannel} {Noise} {Reduction}},
venue = {ELECTR NETWORK},
year = {2021}
}
@incollection{faucris.256929304,
abstract = {The process of combining signals acquired by a microphone array in order to ‘focus’ on a signal in a specific direction is known as beamforming or spatial filtering. This chapter considers signal-independent (fixed) beamformers, controlled by weights only dependent on the direction of arrival of the source to be extracted, and which do not otherwise depend on the desired signal. Because the weights of these beamformers are given by simple expressions, they present the advantages of being straightforward to implement and of having low computational complexity.},
address = {Cham},
author = {Jarrett, Daniel P. and Habets, Emanuël and Naylor, Patrick A.},
booktitle = {Theory and Applications of Spherical Microphone Array Processing},
doi = {10.1007/978-3-319-42211-4_6},
editor = {Jarrett, Daniel P. and Habets, Emanuël A. P. and Naylor, Patrick A.},
faupublication = {yes},
isbn = {978-3-319-82525-0},
keywords = {Beam Pattern; Maximum Directivity; Mode Strength; Open Sphere; Steering Direction},
note = {CRIS-Team Scopus Importer:2021-04-30},
pages = {93-112},
peerreviewed = {unknown},
publisher = {Springer Science and Business Media B.V.},
series = {Springer Topics in Signal Processing},
title = {{Signal}-independent array processing},
volume = {9},
year = {2017}
}
@article{faucris.224408826,
author = {Müller, Meinard and Ellis, Daniel P. W. and Klapuri, Anssi and Richard, Gaël},
doi = {10.1109/JSTSP.2011.2112333},
faupublication = {yes},
journal = {IEEE Journal of Selected Topics in Signal Processing},
pages = {1088-1110},
peerreviewed = {Yes},
title = {{Signal} {Processing} for {Music} {Analysis}},
volume = {5},
year = {2011}
}
@inproceedings{faucris.212186754,
address = {Kyoto, Japan},
author = {Thiergart, Oliver and Del Galdo, Giovanni and Habets, Emanuël},
booktitle = {Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)},
doi = {10.1109/ICASSP.2012.6287878},
faupublication = {yes},
isbn = {9781467300469},
keywords = {Array signal processing; spatial coherence; signal-to-reverberation ratio},
pages = {309-312},
peerreviewed = {unknown},
publisher = {IEEE},
title = {{Signal}-to-reverberant ratio estimation based on the complex spatial coherence between omnidirectional microphones},
venue = {Kyoto},
year = {2012}
}
@inproceedings{faucris.212187079,
author = {Mirabilii, Daniele and Habets, Emanuël},
booktitle = {Proc. of the International Workshop on Acoustic Signal Enhancement (IWAENC)},
doi = {10.1109/IWAENC.2018.8521302},
faupublication = {yes},
isbn = {9781538681510},
keywords = {Multi-channel; Corcos model; Wind noise},
pages = {560-564},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Simulating} multi-channel wind noise based on the {Corcos} model},
year = {2018}
}
@inproceedings{faucris.212187798,
address = {Prague, Czech Republic},
author = {Jarrett, Daniel Phillip and Habets, Emanuël and Thomas, M. R. P. and Naylor, P. A.},
booktitle = {Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)},
doi = {10.1109/ICASSP.2011.5946345},
faupublication = {yes},
isbn = {9781457705397},
keywords = {reverberation; Image method; spherical microphone arrays; room acoustics},
pages = {129--132},
peerreviewed = {unknown},
publisher = {IEEE},
title = {{Simulating} room impulse responses for spherical microphone arrays},
venue = {Prague},
year = {2011}
}
@inproceedings{faucris.285697207,
abstract = {Generating noise samples is crucial in developing and testing noise reduction algorithms or training deep learning models. This work proposes a wind noise generation model with airflow speed-dependent features. A linear predictive analysis of wind noise measured in a wind tunnel at different flow velocities was carried out. This analysis showed that temporal and spectral features depend on the flow speed. The prediction residual's statistics and the filter coefficients are first extracted and then modeled based on the flow speed. The obtained models are then combined to synthetically generate wind noise given a time-varying flow speed profile as input, in contrast to an existing framework where temporal and spectral features were assumed speed-invariant. A subjective evaluation is carried out to assess the perceptual authenticity of the generated noise compared to the existing method.},
author = {Mirabilii, Daniele and Lodermeyer, Alexander and Czwielong, Felix and Becker, Stefan and Habets, Emanuël},
booktitle = {International Workshop on Acoustic Signal Enhancement, IWAENC 2022 - Proceedings},
date = {2022-09-05/2022-09-08},
doi = {10.1109/IWAENC53105.2022.9914785},
faupublication = {yes},
isbn = {9781665468671},
keywords = {analysis; synthesis; Wind noise},
note = {CRIS-Team Scopus Importer:2022-11-25},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Simulating} {Wind} {Noise} with {Airflow} {Speed}-{Dependent} {Characteristics}},
venue = {Bamberg, DEU},
year = {2022}
}
@article{faucris.264563707,
abstract = {Time difference of arrival (TDOA) based indoor ultrasound localization systems are prone to multiple disruptions and demand reliable, and resilient position accuracy during operation. In this challenging context, a missing link to evaluate the performance of such systems is a simulation approach to test their robustness in the presence of disruptions. This approach cannot only replace experiments in early phases of development but could also be used to study susceptibility, robustness, response, and recovery in case of disruptions. The paper presents a simulation framework for a TDOA-based indoor ultrasound localization system and ways to introduce different types of disrup-tions. This framework can be used to test the performance of TDOA-based localization algorithms in the presence of disruptions. Resilience quantification results are presented for representative disruptions. Based on these quantities, it is found that localization with arc-tangent cost function is approximately 30% more resilient than the linear cost function. The simulation approach is shown to apply to resilience engineering and can be used to increase the efficiency and quality of indoor localization methods.},
author = {Jain, Aishvarya Kumar and Schott, Dominik Jan and Scheithauer, Hermann and Häring, Ivo and Höflinger, Fabian and Fischer, Georg and Habets, Emanuël and Gelhausen, Patrick and Schindelhauer, Christian and Rupitsch, Stefan Johann},
doi = {10.3390/s21196332},
faupublication = {yes},
journal = {Sensors},
keywords = {Cross correlation; Disruption; Indoor localization; Localization accuracy; Loss function; Resilience engineering; Simulation; Technical resilience; Time difference of arrival; Ultrasound},
note = {CRIS-Team Scopus Importer:2021-10-01},
peerreviewed = {Yes},
title = {{Simulation}-based resilience quantification of an indoor ultrasound localization system in the presence of disruptions},
volume = {21},
year = {2021}
}
@article{faucris.259585130,
abstract = {Automatically detecting the presence of singing in music audio recordings is a central task within music information retrieval. While modern machine-learning systems produce high-quality results on this task, the reported experiments are usually limited to popular music and the trained systems often overfit to confounding factors. In this paper, we aim to gain a deeper understanding of such machine-learning methods and investigate their robustness in a challenging opera scenario. To this end, we compare two state-of-the-art methods for singing voice detection based on supervised learning: A traditional approach relying on hand-crafted features with a random forest classifier, as well as a deep-learning approach relying on convolutional neural networks. To evaluate these algorithms, we make use of a cross-version dataset comprising 16 recorded performances (versions) of Richard Wagner's four-opera cycle Der Ring des Nibelungen. This scenario allows us to systematically investigate generalization to unseen versions, musical works, or both. In particular, we study the trained systems' robustness depending on the acoustic and musical variety, as well as the overall size of the training dataset. Our experiments show that both systems can robustly detect singing voice in opera recordings even when trained on relatively small datasets with little variety.},
author = {Krause, Michael and Müller, Meinard and Weiß, Christof},
doi = {10.3390/electronics10101214},
faupublication = {yes},
journal = {Electronics},
note = {CRIS-Team WoS Importer:2021-06-04},
peerreviewed = {Yes},
title = {{Singing} {Voice} {Detection} in {Opera} {Recordings}: {A} {Case} {Study} on {Robustness} and {Generalization}},
volume = {10},
year = {2021}
}
@misc{faucris.213610656,
author = {Habets, Emanuël},
faupublication = {no},
peerreviewed = {automatic},
title = {{Single}- and multi-microphone speech dereverberation using spectral enhancement},
url = {http://alexandria.tue.nl/extra2/200710970.pdf},
year = {2007}
}
@inproceedings{faucris.255662799,
abstract = {Acoustic parameters, like the direct-to-reverberation ratio (DRR), can be used in audio processing algorithms to perform, e.g., dereverberation or in audio augmented reality. Often, the DRR is not available and has to be estimated blindly from recorded audio signals. State-of-the-art DRR estimation is achieved by deep neural networks (DNNs), which directly map a feature representation of the acquired signals to the DRR. Motivated by the equality of the signal-to-reverberation ratio and the (channel-based) DRR under certain conditions, we formulate single-channel DRR estimation as an extraction task of two signal components from the recorded audio. The DRR can be obtained by inserting the estimated signals in the definition of the DRR. The extraction is performed using time-frequency masks. The masks are estimated by a DNN trained end-to-end to minimize the mean-squared error between the estimated and the oracle DRR. We conduct experiments with different preprocessing and mask estimation schemes. The proposed method outperforms state-of-the-art single- and multi-channel methods on the ACE challenge data corpus.},
author = {Mack, Wolfgang and Deng, Shuwen and Habets, Emanuël},
booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH},
date = {2020-10-25/2020-10-29},
doi = {10.21437/Interspeech.2020-2171},
faupublication = {yes},
keywords = {ACE challenge; Acoustic parameter; Deep learning; Direct-to-reverberation ratio (DRR) estimation; Time-frequency mask},
note = {CRIS-Team Scopus Importer:2021-04-19},
pages = {5066--5070},
peerreviewed = {unknown},
publisher = {International Speech Communication Association},
title = {{Single}-channel blind direct-to-reverberation ratio estimation using masking},
venue = {Shanghai, CHN},
volume = {2020-October},
year = {2020}
}
@inproceedings{faucris.212118008,
author = {Mack, Wolfgang and Chakrabarty, Soumitro and Stöter, Fabian-Robert and Braun, Sebastian and Edler, Bernd and Habets, Emanuël},
booktitle = {Proc. Interspeech Conf.},
doi = {10.21437/Interspeech.2018-1296},
faupublication = {yes},
pages = {1314--1318},
peerreviewed = {unknown},
title = {{Single}-channel dereverberation using direct {MMSE} optimization and bidirectional {LSTM} networks},
year = {2018}
}
@inproceedings{faucris.213608935,
address = {Veldhoven, The Netherlands},
author = {Habets, Emanuël},
booktitle = {Proc. Workshop Circuits, Systems and Signal Processing (ProRISC)},
faupublication = {no},
pages = {250--254},
peerreviewed = {unknown},
title = {{Single}-channel speech dereverberation based on spectral subtraction},
year = {2004}
}
@inproceedings{faucris.212188582,
address = {Florence, Italy},
author = {Momeni Hojjatabadi, Hajar and Habets, Emanuël and Abutalebi, Hamid Reza},
booktitle = {Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)},
doi = {10.1109/ICASSP.2014.6854131},
faupublication = {yes},
isbn = {9781479928927},
keywords = {inter-band correlations; speech presence probability; inter-frame correlations},
pages = {2903--2907},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Single}-channel speech presence probability estimation using inter-frame and inter-band correlations},
venue = {Florence},
year = {2014}
}
@inproceedings{faucris.212118737,
author = {Plinge, Axel and Schlecht, Sebastian and Thiergart, Oliver and Robotham, Thomas and Rummukainen, Olli and Habets, Emanuël},
booktitle = {Proc. of the AES International Conference on Audio for Virtual and Augmented Reality},
faupublication = {yes},
peerreviewed = {unknown},
title = {{Six}-degrees-of-freedom binaural audio reproduction of first-order ambisonics with distance information},
year = {2018}
}
@inproceedings{faucris.224398891,
address = {London, UK},
author = {Müller, Meinard and Jiang, Nanzhu and Grohganz, Harald},
booktitle = {Proceedings of the 53rd AES Conference on Semantic Audio},
faupublication = {yes},
peerreviewed = {unknown},
title = {{SM} {Toolbox}: {MATLAB} implementations for computing and enhancing similarity matrices},
year = {2014}
}
@inproceedings{faucris.318646351,
abstract = {Many tasks in music information retrieval (MIR) involve weakly aligned data, where exact temporal correspondences are unknown. The connectionist temporal classification (CTC) loss is a standard technique to learn feature representations based on weakly aligned training data. However, CTC is limited to discrete-valued target sequences and can be difficult to extend to multi-label problems. In this article, we show how soft dynamic time warping (SoftDTW), a differentiable variant of classical DTW, can be used as an alternative to CTC. Using multi-pitch estimation as an example scenario, we show that SoftDTW yields results on par with a state-of-the-art multi-label extension of CTC. In addition to being more elegant in terms of its algorithmic formulation, SoftDTW naturally extends to real-valued target sequences.},
author = {Krause, Michael and Weiß, Christof and Müller, Meinard},
booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
date = {2023-06-04/2023-06-10},
doi = {10.1109/ICASSP49357.2023.10095907},
faupublication = {yes},
isbn = {9781728163277},
keywords = {dynamic time warping; multi-pitch estimation; music information retrieval; music processing; music transcription},
note = {Created from Fastlane, Scopus look-up},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Soft} {Dynamic} {Time} {Warping} for {Multi}-{Pitch} {Estimation} and {Beyond}},
venue = {Rhodes Island},
volume = {2023-June},
year = {2023}
}
@inproceedings{faucris.318554622,
address = {Seoul, Korea},
author = {Zeitler, Johannes and Krause, Michael and Müller, Meinard},
booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
faupublication = {yes},
peerreviewed = {Yes},
title = {{Soft} {Dynamic} {Time} {Warping} with {Variable} {Step} {Weights}},
year = {2024}
}
@inproceedings{faucris.212189204,
address = {New Paltz, USA},
author = {Kowalczyk, Konrad and Thiergart, Oliver and Craciun, Alexandra and Habets, Emanuël},
booktitle = {Proc. IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
doi = {10.1109/WASPAA.2013.6701869},
faupublication = {yes},
isbn = {9781479909728},
keywords = {spatial filtering; microphone array processing; Wiener filtering; spatial sound recording},
pages = {1--4},
peerreviewed = {unknown},
publisher = {IEEE},
title = {{Sound} acquisition in noisy and reverberant environments using virtual microphones},
venue = {New Paltz, NY},
year = {2013}
}
@inproceedings{faucris.212189543,
address = {Aachen, Germany},
author = {Thiergart, Oliver and Habets, Emanuël},
booktitle = {Proc. Intl. Workshop Acoust. Echo Noise Control (IWAENC)},
faupublication = {yes},
isbn = {9783800734511},
keywords = {Spatial audio coding; Parametric spatial audio processing; W-disjoint orthogonality},
pages = {1--4},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Sound} field model violations in parametric spatial sound processing},
url = {https://www.scopus.com/record/display.uri?eid=2-s2.0-84957702408&origin=inward},
year = {2012}
}
@inproceedings{faucris.213642432,
address = {Grenoble, France},
author = {Schwartz, Ofer and Braun, Sebastian and Gannot, Sharon and Habets, Emanuël},
booktitle = {Proc. International Conference on Latent Variable Analysis and Signal Separation},
faupublication = {yes},
pages = {182--191},
peerreviewed = {unknown},
title = {{Source} separation, dereverberation and noise reduction using {LCMV} beamformer and postfilter},
year = {2017}
}
@article{faucris.242571195,
abstract = {Dialogue Enhancement (DE) is one of the most promising applications of user interactivity enabled by object-based audio broadcasting. DE allows personalization of the relative level of dialogue for intelligibility or aesthetic reasons. This paper discusses the implementation of DE in object-based audio transport with MPEG-H, with a special focus on source separation methods enabling DE also for legacy content without original objects available. The user-benefit of DE is assessed using the Adjustment/Satisfaction Test methodology. The test results demonstrate the need for an individually adjustable dialogue level because of highly-varying personal preferences. The test also investigates the subjective quality penalty from using source separation for obtaining the objects. The results show that even an imperfect separation result can successfully enable DE leading to increased end-user satisfaction.},
author = {Paulus, Jouni and Torcoli, Matteo and Uhle, Christian and Herre, Jürgen and Disch, Sascha and Fuchs, Harald},
doi = {10.17743/jaes.2019.0032},
faupublication = {yes},
journal = {Journal of the Audio Engineering Society},
note = {herre{\_}books{\_}journals},
pages = {510--521},
peerreviewed = {Yes},
title = {{Source} separation for enabling dialogue enhancement in object-based broadcast with {MPEG}-{H}},
volume = {67},
year = {2019}
}
@article{faucris.318569464,
abstract = {In this work, we address the novel and rarely considered source separation task of decomposing piano concerto recordings into separate piano and orchestral tracks. Being a genre written for a pianist typically accompanied by an ensemble or orchestra, piano concertos often involve an intricate interplay of the piano and the entire orchestra, leading to high spectro–temporal correlations between the constituent instruments. Moreover, in the case of piano concertos, the lack of multi-track data for training constitutes another challenge in view of data-driven source separation approaches. As a basis for our work, we adapt existing deep learning (DL) techniques, mainly used for the separation of popular music recordings. In particular, we investigate spectrogram- and waveform-based approaches as well as hybrid models operating in both spectrogram and waveform domains. As a main contribution, we introduce a musically motivated data augmentation approach for training based on artificially generated samples. Furthermore, we systematically investigate the effects of various augmentation techniques for DL-based models. For our experiments, we use a recently published, open-source dataset of multi-track piano concerto recordings. Our main findings demonstrate that the best source separation performance is achieved by a hybrid model when combining all augmentation techniques.