{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:14:39Z","timestamp":1765340079619,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["92367202"],"award-info":[{"award-number":["92367202"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the NSFC Joint Fund with Zhejiang Integration of Informatization and Industrialization under Key Project","award":["U22A2033"],"award-info":[{"award-number":["U22A2033"]}]},{"name":"the Postdoctoral Fellowship Program of CPSF","award":["GZC20251643"],"award-info":[{"award-number":["GZC20251643"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755216","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"3847-3855","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Reliable Cross-modal Alignment via Prototype Iterative Construction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4963-8705","authenticated-orcid":false,"given":"Xiang","family":"Ma","sequence":"first","affiliation":[{"name":"Shandong University, Jinan, Shandong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5588-5642","authenticated-orcid":false,"given":"Litian","family":"Xu","sequence":"additional","affiliation":[{"name":"The University of Exeter, Exeter, Devon, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3243-851X","authenticated-orcid":false,"given":"Lexin","family":"Fang","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, Shandong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0217-1543","authenticated-orcid":false,"given":"Caiming","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, Shandong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8262-8883","authenticated-orcid":false,"given":"Lizhen","family":"Cui","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China and The Joint SDU-NTU Centre for Artificial Intelligence Research, Jinan, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"FCM: The fuzzy c-means clustering algorithm. Computers & geosciences","author":"Bezdek James C","year":"1984","unstructured":"James C Bezdek, Robert Ehrlich, and William Full. 1984. FCM: The fuzzy c-means clustering algorithm. Computers & geosciences, Vol. 10, 2-3 (1984), 191-203."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01553"},{"key":"e_1_3_2_1_4_1","volume-title":"Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu.","author":"Chen Yen-Chun","year":"2020","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2020b. UNITER: UNiversal Image-TExt Representation Learning."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00831"},{"key":"e_1_3_2_1_6_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In In Proceedings of the 2019 Conference of the North American","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics. 4171-4186."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of The 29th British Machine Vision Conference (BMVC)","author":"Faghri Fartash","year":"2018","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2018. VSE: Improving Visual-Semantic Embeddings with Hard Negatives. Proceedings of The 29th British Machine Vision Conference (BMVC) (2018)."},{"key":"e_1_3_2_1_9_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","volume":"26","author":"Frome Andrea","year":"2013","unstructured":"Andrea Frome, Greg S Corrado, Jon Shlens, Samy Bengio, Jeff Dean, Marc'Aurelio Ranzato, and Tomas Mikolov. 2013. Devise: A deep visual-semantic embedding model. Advances in Neural Information Processing Systems (NeurIPS), Vol. 26 (2013)."},{"key":"e_1_3_2_1_10_1","volume-title":"PAIR: Complementarity-guided Disentanglement for Composed Image Retrieval. In ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 1-5.","author":"Fu Zhiheng","year":"2025","unstructured":"Zhiheng Fu, Zixu Li, Zhiwei Chen, Chunxiao Wang, Xuemeng Song, Yupeng Hu, and Liqiang Nie. 2025. PAIR: Complementarity-guided Disentanglement for Composed Image Retrieval. In ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 1-5."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02485"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01068"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"e_1_3_2_1_15_1","volume-title":"International conference on machine learning. PMLR, 5583-5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International conference on machine learning. PMLR, 5583-5594."},{"key":"e_1_3_2_1_16_1","volume-title":"The 14th International Conference on Learning Representations","author":"Kolesnikov Alexander","year":"2021","unstructured":"Alexander Kolesnikov, Alexey Dosovitskiy, Dirk Weissenborn, Georg Heigold, Jakob Uszkoreit, Lucas Beyer, Matthias Minderer, Mostafa Dehghani, Neil Houlsby, Sylvain Gelly, Thomas Unterthiner, and Xiaohua Zhai. 2021. An image is worth 16x16 words: Transformers for image recognition at scale. The 14th International Conference on Learning Representations (2021)."},{"key":"e_1_3_2_1_17_1","first-page":"201","article-title":"Stacked cross attention for image-text matching","volume":"2018","author":"Lee Kuang-Huei","year":"2018","unstructured":"Kuang-Huei Lee, Xi Chen, Gang Hua, Houdong Hu, and Xiaodong He. 2018. Stacked cross attention for image-text matching. In Computer Vision - ECCV 2018. 201-216.","journal-title":"Computer Vision - ECCV"},{"key":"e_1_3_2_1_18_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022b. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2019.2892755"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/158"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01765"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_25_1","volume-title":"The 12th International Conference on Learning Representations","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled weight decay regularization. The 12th International Conference on Learning Representations (2019)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681424"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01847"},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_2_1_30_1","volume-title":"\u0141 ukasz Kaiser, and Illia Polosukhin","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems (NeurIPS), I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. Curran Associates, Inc."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.541"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3030656"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.309"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350940"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20202"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01521"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611703"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00359"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/IWQoS.2018.8624183"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755216","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:10:39Z","timestamp":1765339839000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755216"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":40,"alternative-id":["10.1145\/3746027.3755216","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755216","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}