{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T17:45:26Z","timestamp":1776879926462,"version":"3.51.2"},"reference-count":102,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U25A20530"],"award-info":[{"award-number":["U25A20530"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["72188101"],"award-info":[{"award-number":["72188101"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62573399"],"award-info":[{"award-number":["62573399"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"New Generation Artificial Intelligence-National Science and Technology Major Project","award":["2025ZD0123101"],"award-info":[{"award-number":["2025ZD0123101"]}]},{"name":"Outstanding Youth Fund of Shandong Province","award":["ZR2021YQ44"],"award-info":[{"award-number":["ZR2021YQ44"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1109\/tpami.2025.3642821","type":"journal-article","created":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T18:45:23Z","timestamp":1765478723000},"page":"4222-4238","source":"Crossref","is-referenced-by-count":1,"title":["Mettle: Meta-Token Learning for Memory-Efficient Audio-Visual Adaptation"],"prefix":"10.1109","volume":"48","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6402-7593","authenticated-orcid":false,"given":"Jinxing","family":"Zhou","sequence":"first","affiliation":[{"name":"Hefei Comprehensive National Science Center, Hefei University of Technology (HFUT), Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9642-8009","authenticated-orcid":false,"given":"Zhihui","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Electronic Engineering and Information Science, School of Information Science and Technology, University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yongqiang","family":"Yu","sequence":"additional","affiliation":[{"name":"Department of Computer Vision, Mohamed bin Zayed University of Artificial Intelligence (MBZUAI), Abu Dhabi, United Arab Emirates"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9401-4432","authenticated-orcid":false,"given":"Yanghao","family":"Zhou","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1091-272X","authenticated-orcid":false,"given":"Ruohao","family":"Guo","sequence":"additional","affiliation":[{"name":"School of Intelligence Science and Technology, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guangyao","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuxin","family":"Mao","sequence":"additional","affiliation":[{"name":"OpenNLPLab, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0040-6177","authenticated-orcid":false,"given":"Mingfei","family":"Han","sequence":"additional","affiliation":[{"name":"Department of Computer Vision, Mohamed bin Zayed University of Artificial Intelligence (MBZUAI), Abu Dhabi, United Arab Emirates"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaojun","family":"Chang","sequence":"additional","affiliation":[{"name":"Department of Electronic Engineering and Information Science, School of Information Science and Technology, University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3094-7735","authenticated-orcid":false,"given":"Meng","family":"Wang","sequence":"additional","affiliation":[{"name":"Hefei Comprehensive National Science Center, Hefei University of Technology (HFUT), Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2445"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0007"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_26"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_22"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"ref8","article-title":"Learning in audio-visual context: A review, analysis, and new perspective","author":"Wei","year":"2022"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00228"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref11","first-page":"2790","article-title":"Parameter-efficient transfer learning for NLP","volume-title":"Proc. IEEE Int. Conf. Mach. Learn. Appl.","author":"Houlsby","year":"2019"},{"key":"ref12","first-page":"12116","article-title":"Do vision transformers see like convolutional neural networks?","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Raghu","year":"2021"},{"key":"ref13","first-page":"1","article-title":"On the relationship between self-attention and convolutional layers","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Cordonnier","year":"2020"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01852"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413869"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i10.33185"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3721981"},{"key":"ref20","article-title":"CLASP: Cross-modal salient anchor-based semantic propagation for weakly-supervised dense audio-visual event localization","author":"Zhou","year":"2025"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"ref23","first-page":"10077","article-title":"Discriminative sounding objects localization via self-supervised audiovisual matching","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Hu","year":"2020"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02525"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02261-x"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681586"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01265"},{"key":"ref28","article-title":"Think before you segment: An object-aware reasoning agent for referring audio-visual segmentation","author":"Zhou","year":"2025"},{"key":"ref29","article-title":"Simtoken: A simple baseline for referring audio-visual segmentation","author":"Jin","year":"2025"},{"key":"ref30","article-title":"An attempt towards interpretable audio-visual video captioning","author":"Tian","year":"2018"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01020"},{"key":"ref32","first-page":"9","article-title":"Audio-visual interpretable and controllable video captioning","volume-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit. Workshops","author":"Tian","year":"2019"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2022.3175012"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680612"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-526"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00204"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28116"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612293"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680803"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i5.32538"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683226"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00833"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3223688"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-69544-6_17"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_39"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01936"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00783"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00138"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_25"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01805"},{"key":"ref51","first-page":"1","article-title":"Modality-independent teachers meet weakly-supervised audio-visual event parser","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lai","year":"2023"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73247-8_1"},{"key":"ref53","article-title":"Improving audio-visual video parsing with pseudo visual labels","author":"Zhou","year":"2023"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72684-2_3"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02142-3"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i10.33134"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611724"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i11.29104"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02487"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.127885"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73464-9_19"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02502"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1145\/3735975"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1165"},{"key":"ref65","first-page":"1022","article-title":"Compacter: Efficient low-rank hypercomplex adapter layers","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Mahabadi","year":"2021"},{"key":"ref66","first-page":"1","article-title":"QA-LoRA: Quantization-aware low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Xu","year":"2024"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.576"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00918"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"ref72","first-page":"1","article-title":"Compositional prompt tuning with motion cues for open-vocabulary video relation detection","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Gao","year":"2023"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01166"},{"key":"ref74","first-page":"1","article-title":"Fine-grained visual prompting","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yang","year":"2024"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02179-4"},{"key":"ref76","first-page":"1","article-title":"PLOT: Prompt learning with optimal transport for vision-language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chen","year":"2024"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00635"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"ref80","article-title":"Consistency-guided prompt learning for vision-language models","author":"Roy","year":"2023"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01394"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01228"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02212"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00190"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00798"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681492"},{"key":"ref87","article-title":"Mixed precision training","author":"Micikevicius","year":"2017"},{"key":"ref88","first-page":"7686","article-title":"Training deep neural networks with 8-bit floating point numbers","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang","year":"2018"},{"key":"ref89","first-page":"2214","article-title":"The reversible residual network: Backpropagation without storing activations","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Gomez","year":"2017"},{"key":"ref90","first-page":"12991","article-title":"LST: Ladder side-tuning for parameter and memory efficient transfer learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Sung","year":"2022"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02714"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72784-9_5"},{"key":"ref93","article-title":"What do vision transformers learn? a visual exploration","author":"Ghiasi","year":"2022"},{"key":"ref94","first-page":"67:1","article-title":"Are all layers created equal?","volume":"23","author":"Zhang","year":"2022","journal-title":"J. Mach. Learn. Res."},{"key":"ref95","article-title":"Interpreting and explaining deep neural networks for classification of audio signals","author":"Becker","year":"2018"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547869"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413581"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2516"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref101","article-title":"VideoLLaMA 3: Frontier multimodal foundation models for image and video understanding","author":"Zhang","year":"2025"},{"key":"ref102","article-title":"Qwen2.5-Omni technical report","author":"Xu","year":"2025"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11424231\/11297850.pdf?arnumber=11297850","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T01:34:05Z","timestamp":1773106445000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11297850\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":102,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3642821","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,4]]}}}