@inproceedings{c79cefd93bce4cbe8859900b2c7d1955,
title = "Synchformer: Efficient Synchronization From Sparse Cues",
abstract = "Our objective is audio-visual synchronization with a focus on {\textquoteleft}in-the-wild{\textquoteright} videos, such as those on YouTube, where synchronization cues can be sparse. Our contributions include a novel audio-visual synchronization model, and training that decouples feature extraction from synchronization modelling through multi-modal segment-level contrastive pre-training. This approach achieves state-of-the-art performance in both dense and sparse settings. We also extend synchronization model training to AudioSet, a million-scale {\textquoteleft}in-the-wild{\textquoteright} dataset, investigate evidence attribution techniques for interpretability, and explore a new capability for synchronization models: audio-visual synchronizability. Project page: robots.ox.ac.uk/~vgg/research/synchformer",
author = "Vladimir Iashin and Weidi Xie and Esa Rahtu and Andrew Zisserman",
year = "2024",
doi = "10.1109/ICASSP48485.2024.10448489",
language = "English",
publisher = "IEEE",
pages = "5325--5329",
booktitle = "ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)",
address = "United States",
note = "IEEE International Conference on Acoustics, Speech and Signal Processing; Conference date: 14-04-2024 through 19-04-2024",
}