% Conference paper by Zhu and Rahtu (EUVIP 2021). The citation key is kept as exported.
% Database metadata (JUFO id, funding, copyright) lives in non-standard fields that
% standard styles ignore, so it is not printed in the bibliography; the conference
% name and dates are in the biblatex event fields (ignored by classic BibTeX styles).
% NOTE(review): "address" holds a country rather than the publisher's city -- confirm and replace.
@inproceedings{d128ebf2933a4dbeb9bd120634ebc43e,
  author     = {Zhu, Lingyu and Rahtu, Esa},
  title      = {Leveraging Category Information for Single-Frame Visual Sound Source Separation},
  booktitle  = {Proceedings of the 2021 9th European Workshop on Visual Information Processing, {EUVIP} 2021},
  editor     = {Beghdadi, A. and Cheikh, {F. Alaya} and Tavares, J. M. R. S. and Mokraoui, A. and Valenzise, G. and Oudre, L. and Qureshi, M. A.},
  series     = {European Workshop on Visual Information Processing},
  publisher  = {IEEE},
  address    = {United States},
  year       = {2021},
  month      = jul,
  day        = {20},
  doi        = {10.1109/EUVIP50544.2021.9484036},
  isbn       = {9781665432313},
  language   = {English},
  keywords   = {attention mechanism, self-supervised learning, sound source localization, visual sound separation},
  abstract   = {Visual sound source separation aims at identifying sound components from a given sound mixture with the presence of visual cues. Prior works have demonstrated impressive results, but with the expense of large multi-stage architectures and complex data representations (e.g. optical flow trajectories). In contrast, we study simple yet efficient models for visual sound separation using only a single video frame. Furthermore, our models are able to exploit the information of the sound source category in the separation process. To this end, we propose two models where we assume that i) the category labels are available at the training time, or ii) we know if the training sample pairs are from the same or different category. The experiments with the MUSIC dataset show that our model obtains comparable or better performance compared to several recent baseline methods. The code is available at https://github.com/ly-zhu/Leveraging-Category-Information-for-Single-Frame-Visual-Sound-Source-Separation.},
  eventtitle = {European Workshop on Visual Information Processing},
  eventdate  = {2021-06-23/2021-06-25},
  jufoid     = {71968},
  funding    = {This work is supported by the Academy of Finland (projects 327910 \& 324346).},
  copyright  = {{\textcopyright} 2021 IEEE.},
}