WaveTransformer: An Architecture for Audio Captioning Based on Learning Temporal and Time-Frequency Information

An Tran, Konstantinos Drossos, Tuomas Virtanen

Research output: Chapter in Book/Report/Conference proceeding › Conference contribution › Scientific › peer-review


Automated audio captioning (AAC) is a novel task, where a method takes as input an audio sample and outputs a textual description (i.e., a caption) of its contents. Most AAC methods are adapted from the image captioning or machine translation fields. In this work, we present a novel AAC method, explicitly focused on the exploitation of the temporal and time-frequency patterns in audio. We employ three learnable processes for audio encoding: two for extracting the temporal and time-frequency information, and one to merge the outputs of the previous two processes. To generate the caption, we employ the widely used Transformer decoder. We assess our method utilizing the freely available splits of the Clotho dataset. Our method raises the previously reported highest SPIDEr score from 16.2 to 17.3 (higher is better).
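The encoder structure described in the abstract can be illustrated with a minimal NumPy sketch. This is not the authors' implementation: the paper's temporal branch is built from WaveNet-style blocks and its time-frequency branch from stacked 2-D CNNs, whereas below each branch is reduced to a single illustrative layer (a dilated, gated 1-D convolution over time, and a per-frame learned projection); all shapes and weights are assumed for illustration.

```python
import numpy as np

rng = np.random.default_rng(0)

# Input: a log mel-spectrogram with T time frames and F mel bands.
# T, F, C are illustrative values, not the paper's hyperparameters.
T, F, C = 8, 64, 16   # C: output channels of each encoder branch
x = rng.standard_normal((T, F))

def temporal_branch(x, dilation):
    """Sketch of the temporal branch: one causal, dilated 1-D convolution
    over time with a WaveNet-style gated activation (tanh * sigmoid)."""
    w_f = rng.standard_normal((2, F, C)) * 0.1   # filter weights, kernel size 2
    w_g = rng.standard_normal((2, F, C)) * 0.1   # gate weights
    pad = np.vstack([np.zeros((dilation, F)), x])  # causal zero-padding
    out = np.empty((T, C))
    for t in range(T):
        taps = np.stack([pad[t], pad[t + dilation]])        # (2, F)
        f = np.einsum('kf,kfc->c', taps, w_f)
        g = np.einsum('kf,kfc->c', taps, w_g)
        out[t] = np.tanh(f) / (1.0 + np.exp(-g))            # gated activation
    return out

def time_frequency_branch(x):
    """Sketch of the time-frequency branch: a learned projection of each
    frame's frequency content (a stand-in for the paper's 2-D CNNs)."""
    w = rng.standard_normal((F, C)) * 0.1
    return np.maximum(x @ w, 0.0)                           # ReLU

def merge(h_temporal, h_tf):
    """Merge the two branch outputs with a learned projection, yielding
    one feature sequence for the Transformer decoder to attend over."""
    w = rng.standard_normal((2 * C, C)) * 0.1
    return np.concatenate([h_temporal, h_tf], axis=1) @ w

h = merge(temporal_branch(x, dilation=2), time_frequency_branch(x))
print(h.shape)  # (T, C): one encoded feature vector per time frame
```

The key design point carried over from the paper is that the two branches see the same input but specialize in different structure, with the merge step producing a single sequence consumed by the caption decoder.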
Original language: English
Title of host publication: 2021 29th European Signal Processing Conference (EUSIPCO)
Number of pages: 5
ISBN (Electronic): 978-9-0827-9706-0
Publication status: Published - 2021
Publication type: A4 Article in conference proceedings
Event: European Signal Processing Conference - Dublin, Ireland
Duration: 23 Aug 2021 - 27 Aug 2021

Publication series

Name: European Signal Processing Conference
ISSN (Electronic): 2076-1465


Conference: European Signal Processing Conference
Abbreviated title: EUSIPCO 2021


Keywords

  • Measurement
  • Time-frequency analysis
  • Neural networks
  • Europe
  • Transformers
  • Encoding
  • Decoding
  • automated audio captioning
  • wavetransformer
  • wavenet
  • transformer

Publication forum classification

  • Publication forum level 1
