{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T09:59:49Z","timestamp":1777888789347,"version":"3.51.4"},"reference-count":90,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100002347","name":"BMBF","doi-asserted-by":"publisher","award":["01IS24060,01I524085B,01IS18039A"],"award-info":[{"award-number":["01IS24060,01I524085B,01IS18039A"]}],"id":[{"id":"10.13039\/501100002347","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001659","name":"DFG","doi-asserted-by":"publisher","award":["276693517"],"award-info":[{"award-number":["276693517"]}],"id":[{"id":"10.13039\/501100001659","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00103","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"1027-1037","source":"Crossref","is-referenced-by-count":0,"title":["VGGSounder: Audio-Visual Evaluations for Foundation Models"],"prefix":"10.1109","author":[{"given":"Daniil","family":"Zverev","sequence":"first","affiliation":[{"name":"Technical University of Munich, MCML"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Th\u00e4ddaus","family":"Wiedemer","sequence":"additional","affiliation":[{"name":"University of T&#x00FC;bingen"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ameya","family":"Prabhu","sequence":"additional","affiliation":[{"name":"University of T&#x00FC;bingen"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Matthias","family":"Bethge","sequence":"additional","affiliation":[{"name":"University of T&#x00FC;bingen"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wieland","family":"Brendel","sequence":"additional","affiliation":[{"name":"University of T&#x00FC;bingen"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"A. Sophia","family":"Koepke","sequence":"additional","affiliation":[{"name":"Technical University of Munich, MCML"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Youtube-8m: A largescale video classification benchmark","author":"Abu-El-Haija","year":"2016","journal-title":"arXiv preprint"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2018.2889052"},{"key":"ref3","article-title":"Self-supervised object detection from audio-visual correspondence","author":"Afouras","year":"2020","journal-title":"ECCV"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054253"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58523-5_13"},{"key":"ref6","article-title":"Self-supervised learning by cross-modal audio-video clustering","author":"Alwassel","year":"2020","journal-title":"NeurIPS"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02009"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"ref10","article-title":"Labelling unlabelled videos from scratch with multi-modal self-supervision","author":"Asano","year":"2020","journal-title":"NeurIPS"},{"key":"ref11","article-title":"Soundnet: Learning sound representations from unlabeled video","author":"Aytar","year":"2016","journal-title":"NeurIPS"},{"key":"ref12","author":"Beyer","year":"2020","journal-title":"Are we done with imagenet? arXiv preprint"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref14","article-title":"Audio-visual synchronisation in the wild","author":"Chen","year":"2021","journal-title":"BMVC"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3009820"},{"key":"ref17","article-title":"Beats: Audio pre-training with acoustic tokenizers","author":"Chen","year":"2022","journal-title":"arXiv preprint"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00694"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01749"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413869"},{"key":"ref21","article-title":"Videollama 2: Advancing spatialtemporal modeling and audio understanding in video-llmss","author":"Cheng","year":"2024","journal-title":"arXiv preprint"},{"key":"ref22","author":"Chiang","year":"2023","journal-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality."},{"key":"ref23","article-title":"Out of time: automated lip sync in the wild","author":"Chung","year":"2016","journal-title":"ACCV"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref25","article-title":"Detection of audio-video synchronization errors via event detection","author":"Ebeneze","year":"2021","journal-title":"ICASSP"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_44"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00398"},{"key":"ref28","author":"Pradipta Gema","year":"2024","journal-title":"Are we done with mmlu? arXiv preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.2017.7952261"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref31","article-title":"Guitar music transcription from silent video","author":"Goldstein","year":"2018","journal-title":"BMVC"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3224688"},{"key":"ref33","article-title":"Contrastive audio-visual masked autoencoder","author":"Gong","year":"2023","journal-title":"ICLR"},{"key":"ref34","article-title":"A survey on 11 m -as-a-judge","author":"Gu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref35","article-title":"Aligning ai with shared human values","author":"Hendrycks","year":"2021","journal-title":"ICLR"},{"key":"ref36","article-title":"Measuring massive multitask language understanding","author":"Hendrycks","year":"2021","journal-title":"ICLR"},{"key":"ref37","article-title":"Sparse in space and time: Audio-visual synchronisation with trainable selectors","author":"Iashin","year":"2022","journal-title":"BMVC"},{"key":"ref38","article-title":"Sparse in space and time: Audio-visual synchronisation with trainable selectors","author":"Iashin","year":"2022","journal-title":"BMVC"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448489"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01150-y"},{"key":"ref41","article-title":"The kinetics human action video dataset","author":"Kay","year":"2017","journal-title":"arXiv preprint"},{"key":"ref42","article-title":"On attention modules for audio-visual synchronization","volume-title":"CVPR Workshop","author":"Khosravan","year":"2019"},{"key":"ref43","article-title":"Equiav: Leveraging equivariance for audio-visual contrastive learning","author":"Kim","year":"2024","journal-title":"ICML"},{"key":"ref44","article-title":"Visual pitch estimation","author":"Sophia","year":"2019","journal-title":"SMC"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053115"},{"key":"ref46","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","author":"Korbar","year":"2018","journal-title":"NeurIPS"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01011"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01852"},{"key":"ref49","article-title":"Siamese vision transformers are scalable audio-visual learners","author":"Lin","year":"2024","journal-title":"ECCV"},{"key":"ref50","article-title":"Audiovisual transformer with instance attention for audio-visual event localization","author":"Lin","year":"2020","journal-title":"ACCV"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.2019.8683226"},{"key":"ref52","article-title":"Ola: Pushing the frontiers of omni-modal language model","author":"Liu","year":"2025","journal-title":"arXiv preprint"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02497"},{"key":"ref54","article-title":"Look, listen, and answer: Overcoming biases for audio-visual question answering","author":"Ma","year":"2024","journal-title":"NeurIPS"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_13"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02567"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.5244\/C.34.10"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054057"},{"key":"ref60","article-title":"Attention bottlenecks for multimodal fusion","author":"Nagrani","year":"2021","journal-title":"NeurIPS"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/wacv51458.2022.00058"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_48"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-1083-5"},{"key":"ref65","article-title":"Multi-modal self-supervision from generalized data transformations","author":"Patrick","year":"2020","journal-title":"NeurIPS"},{"key":"ref66","article-title":"Visual keyword spotting with attention","author":"Prajwal","year":"2021","journal-title":"BMVC"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_18"},{"key":"ref68","article-title":"Robust speech recognition via large-scale weak supervision","author":"Radford","year":"2023","journal-title":"ICLR"},{"key":"ref69","article-title":"Multiinstrumentalist net: Unsupervised generation of music from body movements","author":"Su","year":"2020","journal-title":"arXiv preprint"},{"key":"ref70","author":"Su","year":"2021","journal-title":"How does it sound? In NeurIPS"},{"key":"ref71","article-title":"From vision to audio and beyond: A unified model for audio-visual representation and generation","author":"Su","year":"2024","journal-title":"ICML"},{"key":"ref72","article-title":"Pandagpt: One model to instruction-follow them all","author":"Su","year":"2023","journal-title":"arXiv preprint"},{"key":"ref73","article-title":"Gemini: a family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv preprint"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"ref75","article-title":"Into the wild with audioscope: Unsupervised audio-visual separation of on-screen sounds","author":"Tzinis","year":"2021","journal-title":"ICLR"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890587"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00639"},{"key":"ref79","article-title":"Audiovisual slowfast networks for video recognition","author":"Xiao","year":"2020","journal-title":"arXiv preprint"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413581"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72698-9_8"},{"key":"ref82","article-title":"Qwen3 technical report","author":"Yang","year":"2025","journal-title":"arXiv preprint"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548291"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00204"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00182"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00037"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00374"},{"key":"ref89","article-title":"Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment","author":"Zhu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00224"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11446094.pdf?arnumber=11446094","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T04:58:19Z","timestamp":1777611499000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11446094\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":90,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00103","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}