{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:47:56Z","timestamp":1778082476999,"version":"3.51.4"},"reference-count":45,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,4,14]]},"DOI":"10.1109\/icassp48485.2024.10445941","type":"proceedings-article","created":{"date-parts":[[2024,3,18]],"date-time":"2024-03-18T18:56:31Z","timestamp":1710788191000},"page":"6890-6894","source":"Crossref","is-referenced-by-count":12,"title":["AV-SUPERB: A Multi-Task Evaluation Benchmark for Audio-Visual Representation Models"],"prefix":"10.1109","author":[{"given":"Yuan","family":"Tseng","sequence":"first","affiliation":[{"name":"National Taiwan University,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Layne","family":"Berry","sequence":"additional","affiliation":[{"name":"University of Texas at Austin,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi-Ting","family":"Chen","sequence":"additional","affiliation":[{"name":"Academia Sinica,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"I-Hsiang","family":"Chiu","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hsuan-Hao","family":"Lin","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Max","family":"Liu","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Puyuan","family":"Peng","sequence":"additional","affiliation":[{"name":"University of Texas at Austin,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi-Jen","family":"Shih","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hung-Yu","family":"Wang","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haibin","family":"Wu","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Po-Yao","family":"Huang","sequence":"additional","affiliation":[{"name":"Meta AI"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chun-Mao","family":"Lai","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shang-Wen","family":"Li","sequence":"additional","affiliation":[{"name":"Meta AI"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"David","family":"Harwath","sequence":"additional","affiliation":[{"name":"University of Texas at Austin,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"Tsao","sequence":"additional","affiliation":[{"name":"Academia Sinica,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Abdelrahman","family":"Mohamed","sequence":"additional","affiliation":[{"name":"Rembrand"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chi-Luen","family":"Feng","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hung-Yi","family":"Lee","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"NeurIPS","author":"Baevski"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN52387.2021.9534474"},{"key":"ref4","article-title":"Masked autoencoders that listen","volume-title":"NeurIPS","author":"Huang"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1775"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.580"},{"key":"ref9","article-title":"Hear: Holistic evaluation of audio representations","volume-title":"NeurIPS 2021 Competitions and Demonstrations Track","author":"Turian"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_36"},{"key":"ref11","article-title":"Benchmarking self-supervised video representation learning","author":"Kumar","year":"2023"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1038\/264746a0"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1016\/j.tics.2006.04.008"},{"key":"ref14","article-title":"Soundnet: Learning sound representations from unlabeled video","volume-title":"NeurIPS","author":"Aytar"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref16","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","volume-title":"NeurIPS","author":"Korbar"},{"key":"ref17","article-title":"Self-supervised learning by crossmodal audio-video clustering","volume-title":"NeurIPS","author":"Alwassel"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01039"},{"key":"ref19","article-title":"The kinetics human action video dataset","author":"Kay","year":"2017"},{"key":"ref20","article-title":"Ucf101: A dataset of 101 human actions classes from videos in the wild","author":"Soomro","year":"2012"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01531-2"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.367"},{"key":"ref24","article-title":"Lrs3-ted: a large-scale dataset for visual speech recognition","author":"Afouras","year":"2018"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-950"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2018-1929"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"ref28","doi-asserted-by":"crossref","DOI":"10.1109\/SLT54892.2023.10022770","article-title":"Superb@ slt 2022: Challenge on generalization and efficiency of self-supervised speech representation learning","volume-title":"SLT","author":"Feng"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref30","article-title":"VLUE: A multi-task multi-dimension benchmark for evaluating vision-language pretraining","volume-title":"ICML","author":"Zhou"},{"key":"ref31","article-title":"Value: A multi-task benchmark for video-and-language understanding evaluation","volume-title":"NeurIPS Track on Datasets and Benchmarks","author":"Li"},{"key":"ref32","article-title":"Multibench: Multiscale benchmarks for multimodal representation learning","volume-title":"NeurIPS Track on Datasets and Benchmarks","author":"Liang"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2003.817150"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054057"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/s00138-018-0960-9"},{"key":"ref36","article-title":"Learning audio-visual speech representation by masked multimodal cluster prediction","volume-title":"ICLR","author":"Shi"},{"key":"ref37","article-title":"Learning state-aware visual representations from audible interactions","volume-title":"NeurIPS","author":"Mittal"},{"key":"ref38","article-title":"Parameter efficient multimodal transformers for video representation learning","volume-title":"ICLR","author":"Lee"},{"key":"ref39","article-title":"Mavil: Masked audio-video learners","volume-title":"NeurIPS","author":"Huang"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref44","article-title":"Sentence encoders on stilts: Supplementary training on intermediate labeled-data tasks","author":"Phang","year":"2018"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1439"}],"event":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Seoul, Korea, Republic of","start":{"date-parts":[[2024,4,14]]},"end":{"date-parts":[[2024,4,19]]}},"container-title":["ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10445798\/10445803\/10445941.pdf?arnumber=10445941","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,2]],"date-time":"2024-08-02T04:31:19Z","timestamp":1722573079000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10445941\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,14]]},"references-count":45,"URL":"https:\/\/doi.org\/10.1109\/icassp48485.2024.10445941","relation":{},"subject":[],"published":{"date-parts":[[2024,4,14]]}}}