{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:10:24Z","timestamp":1755825024401,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China (NSFC)","award":["62225207, 62436008, 62422609, 62276243"],"award-info":[{"award-number":["62225207, 62436008, 62422609, 62276243"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733320","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:29:43Z","timestamp":1750876183000},"page":"506-515","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Efficient Prompt-based Multimodal Interaction for Audio-Visual Event Localization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-2366-0764","authenticated-orcid":false,"given":"Longzhuo","family":"Huang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2440-2905","authenticated-orcid":false,"given":"Liang","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8036-4071","authenticated-orcid":false,"given":"Xueyang","family":"Fu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2510-8993","authenticated-orcid":false,"given":"Zhengjun","family":"Zha","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","unstructured":"Anurag Arnab Mostafa Dehghani Georg Heigold Chen Sun Mario Lucic and Cordelia Schmid. 2021. ViViT: A Video Vision Transformer. doi:10.48550\/arXiv.2103.15691 arXiv:2103.15691 [cs]","DOI":"10.48550\/arXiv.2103.15691"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image Is Worth 16x16 Words: Transformers for Image Recognition at Scale. doi:10.48550\/arXiv.2010.11929 arXiv:2010.11929 [cs]","DOI":"10.48550\/arXiv.2010.11929"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00406"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","unstructured":"Haoyi Duan Yan Xia Mingze Zhou Li Tang Jieming Zhu and Zhou Zhao. 2023. Cross-Modal Prompts: Adapting Large Pre-trained Models for Audio-Visual Downstream Tasks. doi:10.48550\/arXiv.2311.05152 arXiv:2311.05152 [cs]","DOI":"10.48550\/arXiv.2311.05152"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","unstructured":"Deniz Engin and Yannis Avrithis. 2023. Zero-Shot and Few-Shot Video Question Answering with Multi-Modal Prompts. 
doi:10.48550\/arXiv.2309.15915 arXiv:2309.15915 [cs]","DOI":"10.48550\/arXiv.2309.15915"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612017"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2307.01146"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612506"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","unstructured":"Andrey Guzhov Federico Raue J\u00f6rn Hees and Andreas Dengel. 2021. Audio-CLIP: Extending CLIP to Image Text and Audio. doi:10.48550\/arXiv.2106.13043 arXiv:2106.13043 [cs eess]","DOI":"10.48550\/arXiv.2106.13043"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","unstructured":"Cheng Han Qifan Wang Yiming Cui Zhiwen Cao Wenguan Wang Siyuan Qi and Dongfang Liu. 2023. E2VPT: An Effective and Efficient Approach for Visual Prompt Tuning. doi:10.48550\/arXiv.2307.13770 arXiv:2307.13770 [cs]","DOI":"10.48550\/arXiv.2307.13770"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681503"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","unstructured":"Menglin Jia Luming Tang Bor-Chun Chen Claire Cardie Serge Belongie Bharath Hariharan and Ser-Nam Lim. 2022. Visual Prompt Tuning. doi:10.48550\/arXiv.2203.12119 arXiv:2203.12119 [cs]","DOI":"10.48550\/arXiv.2203.12119"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","unstructured":"Muhammad Uzair Khattak Hanoona Rasheed Muhammad Maaz Salman Khan and Fahad Shahbaz Khan. 2023. MaPLe: Multi-modal Prompt Learning. doi:10.48550\/arXiv.2210.03117 arXiv:2210.03117 [cs]","DOI":"10.48550\/arXiv.2210.03117"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1412.6980"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","unstructured":"Brian Lester Rami Al-Rfou and Noah Constant. 2021. The Power of Scale for Parameter-Efficient Prompt Tuning. doi:10.48550\/arXiv.2104.08691 arXiv:2104.08691 [cs]","DOI":"10.48550\/arXiv.2104.08691"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","unstructured":"Xiang Lisa Li and Percy Liang. 2021. Prefix-Tuning: Optimizing Continuous Prompts for Generation. doi:10.48550\/arXiv.2101.00190 arXiv:2101.00190 [cs]","DOI":"10.48550\/arXiv.2101.00190"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683226"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","unstructured":"Yan-Bo Lin Yi-Lin Sung Jie Lei Mohit Bansal and Gedas Bertasius. 2023. Vision Transformers Are Parameter-Efficient Audio-Visual Learners. doi:10.48550\/arXiv. 2212.07983 arXiv:2212.07983 [cs eess]","DOI":"10.48550\/arXiv"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the Asian Conference on Computer Vision.","author":"Lin Yan-Bo","year":"2020","unstructured":"Yan-Bo Lin and Yu-Chiang Frank Wang. 2020. Audiovisual Transformer with Instance Attention for Audio-Visual Event Localization. In Proceedings of the Asian Conference on Computer Vision."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","unstructured":"Pengfei Liu Weizhe Yuan Jinlan Fu Zhengbao Jiang Hiroaki Hayashi and Graham Neubig. 2021. Pre-Train Prompt and Predict: A Systematic Survey of Prompting Methods in Natural Language Processing. 
doi:10.48550\/arXiv.2107.13586 arXiv:2107.13586 [cs]","DOI":"10.48550\/arXiv.2107.13586"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","unstructured":"Xiao Liu Yanan Zheng Zhengxiao Du Ming Ding Yujie Qian Zhilin Yang and Jie Tang. 2023. GPT Understands Too. doi:10.48550\/arXiv.2103.10385 arXiv:2103.10385 [cs]","DOI":"10.48550\/arXiv.2103.10385"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Ze Liu Han Hu Yutong Lin Zhuliang Yao Zhenda Xie YixuanWei Jia Ning Yue Cao Zheng Zhang Li Dong FuruWei and Baining Guo. 2022. Swin Transformer V2: Scaling Up Capacity and Resolution. arXiv:2111.09883 [cs.CV]","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00513"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3327605"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1912.01703"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","unstructured":"Guanghui Qin and Jason Eisner. 2021. Learning How to Ask: Querying LMs with Mixtures of Soft Prompts. doi:10.48550\/arXiv.2104.06599 arXiv:2104.06599 [cs]","DOI":"10.48550\/arXiv.2104.06599"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2103.00020"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053895"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_39"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","unstructured":"Pengwei Tang Xiaolin Hu and Yong Liu. 2025. ADePT: Adaptive Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning. doi:10.48550\/arXiv.2501. 03291 arXiv:2501.03291 [cs]","DOI":"10.48550\/arXiv.2501"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","unstructured":"Yapeng Tian Jing Shi Bochen Li Zhiyao Duan and Chenliang Xu. 2018. Audio-Visual Event Localization in Unconstrained Videos. doi:10.48550\/arXiv.1803.08842 arXiv:1803.08842 [cs]","DOI":"10.48550\/arXiv.1803.08842"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3376399"},{"key":"e_1_3_2_1_34_1","first-page":"2579","article-title":"Visualizing Data Using T-SNE","volume":"9","author":"van der Maaten Laurens","year":"2008","unstructured":"Laurens van der Maaten and Geoffrey Hinton. 2008. Visualizing Data Using T-SNE. Journal of Machine Learning Research 9, 86 (2008), 2579--2605.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3277462"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","unstructured":"Zhen Wang Rameswar Panda Leonid Karlinsky Rogerio Feris Huan Sun and Yoon Kim. 2023. Multitask Prompt Tuning Enables Parameter-Efficient Transfer Learning. doi:10.48550\/arXiv.2303.02861 arXiv:2303.02861 [cs]","DOI":"10.48550\/arXiv.2303.02861"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00639"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01936"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413581"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5361"},{"key":"e_1_3_2_1_41_1","unstructured":"Jiashuo Yu Ying Cheng Rui-Wei Zhao Rui Feng and Yuejie Zhang. 2021. MMPyramid: Multimodal Pyramid Attentional Network for Audio-Visual Event Localization and Video Parsing. 
https:\/\/arxiv.org\/abs\/2111.12374v2."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681232"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","unstructured":"Donghuo Zeng and Kazushi Ikeda. 2023. Two-Stage Triplet Loss Training with Curriculum Augmentation for Audio-Visual Retrieval. doi:10.48550\/arXiv.2310.13451 arXiv:2310.13451","DOI":"10.48550\/arXiv.2310.13451"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3429192"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/s40747-024-01654-2"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681492"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","unstructured":"Jinxing Zhou Liang Zheng Yiran Zhong Shijie Hao and Meng Wang. 2021. Positive Sample Propagation along the Audio-Visual Event Line. doi:10.48550\/arXiv.2104.00239 arXiv:2104.00239 [cs eess]","DOI":"10.48550\/arXiv.2104.00239"}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733320","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:09:10Z","timestamp":1755749350000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733320"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":47,"alternative-id":["10.1145\/3731715.3733320","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733320","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
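
The record above is what Crossref's public REST API returns for this DOI, with the bibliographic payload nested under "message". What follows is not part of the deposit; it is a minimal Python sketch of how such a "work" message can be fetched and parsed using the requests library. The endpoint is the standard api.crossref.org works route; the mailto address in the User-Agent is a placeholder (Crossref's "polite pool" convention) that you would replace with your own contact.

import requests

DOI = "10.1145/3731715.3733320"  # taken from the record above

# Fetch the work record; identifying yourself via a mailto User-Agent
# routes the request to Crossref's polite pool.
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    headers={"User-Agent": "example-client/0.1 (mailto:you@example.org)"},
    timeout=30,
)
resp.raise_for_status()
msg = resp.json()["message"]  # bibliographic payload sits under "message"

# Titles and container titles are arrays even when there is one value.
print(msg["title"][0])            # paper title
print(msg["container-title"][0])  # proceedings title
print(msg["page"], msg["DOI"])    # "506-515" and the DOI itself

# Each author entry carries given/family names plus optional ORCID and
# affiliation objects.
for a in msg.get("author", []):
    affs = "; ".join(x["name"] for x in a.get("affiliation", []))
    print(a["given"], a["family"], "-", affs)

# Deposited references mix structured entries (with a "DOI" field) and
# bare "unstructured" citation strings, as visible in the record above.
refs = msg.get("reference", [])
with_doi = sum(1 for r in refs if "DOI" in r)
print(len(refs), "references deposited,", with_doi, "with a DOI")

Note that volatile fields such as "is-referenced-by-count" (0 here) and "deposited" are updated over time, so a fresh fetch may differ from the snapshot above in those values.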