{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:10:26Z","timestamp":1755825026665,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":31,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62306064"],"award-info":[{"award-number":["62306064"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Sichuan Central-Guided Local Science and Technology Development","award":["2023ZYD0165"],"award-info":[{"award-number":["2023ZYD0165"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733423","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:31:39Z","timestamp":1750876299000},"page":"164-172","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["RobustPT: Dynamic Disentanglement Prompt Tuning in Vision-Language Models with Missing Modalities"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8944-6759","authenticated-orcid":false,"given":"Ruiting","family":"Dai","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, ChengDu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0693-5230","authenticated-orcid":false,"given":"Yuqiao","family":"Tan","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, ChengDu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4742-4456","authenticated-orcid":false,"given":"Lisi","family":"Mo","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, ChengDu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8676-7429","authenticated-orcid":false,"given":"Tao","family":"He","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, ChengDu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6174-3877","authenticated-orcid":false,"given":"Ke","family":"Qin","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, ChengDu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7387-2801","authenticated-orcid":false,"given":"Shuang","family":"Liang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, ChengDu, China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Luis Enrique Erro No, Sta Ma Tonantzintla, and Fabio A Gonz\u00e1lez.","author":"Arevalo John","year":"2017","unstructured":"John Arevalo, Thamar Solorio, Manuel Montes-y G\u00f3mez, Luis Enrique Erro No, Sta Ma Tonantzintla, and Fabio A Gonz\u00e1lez. 2017. GATED MULTIMODAL UNITS FOR INFORMATION FU. stat, Vol. 1050 (2017), 7."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00700"},{"key":"e_1_3_2_1_3_1","volume-title":"Muap: Multi-step adaptive prompt learning for vision-language model with missing modality. arXiv preprint arXiv:2409.04693","author":"Dai Ruiting","year":"2024","unstructured":"Ruiting Dai, Yuqiao Tan, Lisi Mo, Tao He, Ke Qin, and Shuang Liang. 2024a. Muap: Multi-step adaptive prompt learning for vision-language model with missing modality. arXiv preprint arXiv:2409.04693 (2024)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3658040"},{"key":"e_1_3_2_1_5_1","unstructured":"Zirun Guo Tao Jin and Zhou Zhao. 2024. Multimodal Prompt Learning with Missing Modalities for Sentiment Analysis and Emotion Recognition. arxiv: 2407.05374 [cs.CL] https:\/\/arxiv.org\/abs\/2407.05374"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2019.8803025"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2019.01.019"},{"key":"e_1_3_2_1_9_1","unstructured":"Jaehyuk Jang Yooseung Wang and Changick Kim. 2023. Towards Robust Multimodal Prompting With Missing Modalities. arxiv: 2312.15890 [cs.CV] https:\/\/arxiv.org\/abs\/2312.15890"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3060167"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"e_1_3_2_1_13_1","volume-title":"The hateful memes challenge: Detecting hate speech in multimodal memes. Advances in neural information processing systems","author":"Kiela Douwe","year":"2020","unstructured":"Douwe Kiela, Hamed Firooz, Aravind Mohan, Vedanuj Goswami, Amanpreet Singh, Pratik Ringshia, and Davide Testuggine. 2020. The hateful memes challenge: Detecting hate speech in multimodal memes. Advances in neural information processing systems, Vol. 33 (2020), 2611--2624."},{"key":"e_1_3_2_1_14_1","volume-title":"International conference on machine learning. PMLR, 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International conference on machine learning. PMLR, 5583--5594."},{"key":"e_1_3_2_1_15_1","unstructured":"Alexander Kolesnikov Alexey Dosovitskiy Dirk Weissenborn Georg Heigold Lucas Beyer Matthias Minderer Mostafa Dehghani Neil Houlsby Thomas Unterthiner and Xiaohua Zhai. [n. d.]. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ( [n. d.])."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01435"},{"key":"e_1_3_2_1_17_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00202"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101973"},{"key":"e_1_3_2_1_20_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_21_1","volume-title":"ImageBERT: Cross-modal Pre-training with Large-scale Weak-supervised Image-Text Data. arXiv e-prints","author":"Qi Di","year":"2020","unstructured":"Di Qi, Lin Su, Jia Song, Edward Cui, Taroon Bharti, and Arun Sacheti. 2020. ImageBERT: Cross-modal Pre-training with Large-scale Weak-supervised Image-Text Data. arXiv e-prints (2020), arXiv--2001."},{"key":"e_1_3_2_1_22_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_23_1","volume-title":"VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In International Conference on Learning Representations.","author":"Su Weijie","year":"2019","unstructured":"Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. 2019. VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00146"},{"key":"e_1_3_2_1_25_1","volume-title":"Better wit than wealth: Dynamic Parametric Retrieval Augmented Generation for Test-time Knowledge Enhancement. arXiv preprint arXiv:2503.23895","author":"Tan Yuqiao","year":"2025","unstructured":"Yuqiao Tan, Shizhu He, Huanxuan Liao, Jun Zhao, and Kang Liu. 2025. Better wit than wealth: Dynamic Parametric Retrieval Augmented Generation for Test-time Knowledge Enhancement. arXiv preprint arXiv:2503.23895 (2025)."},{"key":"e_1_3_2_1_26_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW.2015.7169757"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"e_1_3_2_1_29_1","volume-title":"Tip-adapter: Training-free clip-adapter for better vision-language modeling. arXiv preprint arXiv:2111.03930","author":"Zhang Renrui","year":"2021","unstructured":"Renrui Zhang, Rongyao Fang, Wei Zhang, Peng Gao, Kunchang Li, Jifeng Dai, Yu Qiao, and Hongsheng Li. 2021. Tip-adapter: Training-free clip-adapter for better vision-language modeling. arXiv preprint arXiv:2111.03930 (2021)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733423","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:15:20Z","timestamp":1755749720000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733423"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":31,"alternative-id":["10.1145\/3731715.3733423","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733423","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}