{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:07:11Z","timestamp":1765310831003,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755585","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"4748-4757","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Noise Self-Correction via Relation Propagation for Robust Cross-Modal Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-3316-2490","authenticated-orcid":false,"given":"Ruoxuan","family":"Li","sequence":"first","affiliation":[{"name":"Nanjing University of Science and Technology, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5689-3818","authenticated-orcid":false,"given":"Xiangyu","family":"Wu","sequence":"additional","affiliation":[{"name":"Nanjing University of Science and Technology, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5245-3584","authenticated-orcid":false,"given":"Yang","family":"Yang","sequence":"additional","affiliation":[{"name":"Nanjing University of Science and Technology, NanJing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"1","article-title":"Cross-modal graph matching network for image-text retrieval","author":"Yuhao Cheng","year":"2022","unstructured":"Yuhao Cheng et al. ''Cross-modal graph matching network for image-text retrieval''. 
In: ACM Transactions on Multimedia Computing, Communications, and Applications (TOMM) (2022), pp. 1--23.","journal-title":"ACM Transactions on Multimedia Computing, Communications, and Applications (TOMM)"},{"key":"e_1_3_2_1_2_1","first-page":"201","volume-title":"Proceedings of the European conference on computer vision (ECCV).","author":"Kuang-Huei","year":"2018","unstructured":"Kuang-Huei Lee et al. ''Stacked cross attention for image-text matching''. In: Proceedings of the European conference on computer vision (ECCV). 2018, pp. 201--216."},{"key":"e_1_3_2_1_3_1","first-page":"8415","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","author":"Sanghyuk","year":"2021","unstructured":"Sanghyuk Chun et al. ''Probabilistic embeddings for cross-modal retrieval''. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2021, pp. 8415--8424."},{"key":"e_1_3_2_1_4_1","first-page":"15692","volume-title":"Proceedings of the IEEE\/CVF conference on computer Vision and pattern recognition.","author":"Haoyu","year":"2022","unstructured":"Haoyu Lu et al. ''Cots: Collaborative two-stream vision-language pre-training model for cross-modal retrieval''. In: Proceedings of the IEEE\/CVF conference on computer Vision and pattern recognition. 2022, pp. 15692--15701."},{"key":"e_1_3_2_1_5_1","first-page":"1","volume-title":"2024 IEEE International Conference on Multimedia and Expo (ICME). IEEE.","author":"Fengqiang","year":"2024","unstructured":"Fengqiang Wan et al. ''Covlr: Coordinating cross-modal consistency and intramodal relations for vision-language retrieval''. In: 2024 IEEE International Conference on Multimedia and Expo (ICME). IEEE. 2024, pp. 1--6."},{"key":"e_1_3_2_1_6_1","first-page":"3300","article-title":"Rethinking Label-Wise Cross-Modal Retrieval from A Semantic Sharing Perspective","author":"Yang Yang","year":"2021","unstructured":"Yang Yang et al. 
''Rethinking Label-Wise Cross-Modal Retrieval from A Semantic Sharing Perspective.'' In: IJCAI. 2021, pp. 3300--3306.","journal-title":"IJCAI."},{"key":"e_1_3_2_1_7_1","first-page":"2556","volume-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers).","author":"Piyush","year":"2018","unstructured":"Piyush Sharma et al. ''Conceptual captions: A cleaned, hypernymed, image alttext dataset for automatic image captioning''. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2018, pp. 2556--2565."},{"key":"e_1_3_2_1_8_1","first-page":"29406","volume-title":"Advances in Neural Information Processing Systems","author":"Zhenyu","year":"2021","unstructured":"Zhenyu Huang et al. ''Learning with noisy correspondence for cross-modal matching''. In: Advances in Neural Information Processing Systems (2021), pp. 29406--29419."},{"key":"e_1_3_2_1_9_1","first-page":"852","volume-title":"Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval.","author":"Quanxing","year":"2024","unstructured":"Quanxing Zha et al. ''Ugncl: Uncertainty-guided noisy correspondence learning for efficient cross-modal matching''. In: Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval. 2024, pp. 852--861."},{"key":"e_1_3_2_1_10_1","first-page":"3884","article-title":"Learning from noisy correspondence with tri-partition for cross-modal matching","author":"Zerun Feng","year":"2023","unstructured":"Zerun Feng et al. ''Learning from noisy correspondence with tri-partition for cross-modal matching''. In: IEEE Transactions on Multimedia (2023), pp. 
3884--3896.","journal-title":"IEEE Transactions on Multimedia"},{"key":"e_1_3_2_1_11_1","first-page":"7341","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"Zhang Xu","year":"2024","unstructured":"Xu Zhang, Hao Li, and Mang Ye. ''Negative Pre-aware for Noisy Cross-Modal Matching''. In: Proceedings of the AAAI Conference on Artificial Intelligence (2024), pp. 7341--7349."},{"key":"e_1_3_2_1_12_1","first-page":"19883","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Shuo","year":"2023","unstructured":"Shuo Yang et al. ''Bicro: Noisy correspondence rectification for multi-modality data via bi-directional cross-modal similarity consistency''. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2023, pp. 19883--19892."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3247939"},{"key":"e_1_3_2_1_14_1","first-page":"12091","volume-title":"Proceedings of the AAAI conference on artificial intelligence.","volume":"38","author":"Zhongtian","year":"2024","unstructured":"Zhongtian Fu et al. ''Noise-aware image captioning with progressively exploring mismatched words''. In: Proceedings of the AAAI conference on artificial intelligence. Vol. 38. 11. 2024, pp. 12091--12099."},{"key":"e_1_3_2_1_15_1","first-page":"7517","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Haochen","year":"2023","unstructured":"Haochen Han et al. ''Noisy correspondence learning with meta similarity correction''. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2023, pp. 7517--7526."},{"key":"e_1_3_2_1_16_1","first-page":"2587","article-title":"Cross-modal retrieval with noisy correspondence via consistency refining and mining","author":"Xinran Ma","year":"2024","unstructured":"Xinran Ma et al. 
''Cross-modal retrieval with noisy correspondence via consistency refining and mining''. In: IEEE Transactions on Image Processing (2024), pp. 2587--2598.","journal-title":"IEEE Transactions on Image Processing"},{"key":"e_1_3_2_1_17_1","article-title":"Rebalanced Vision-Language Retrieval Considering Structure-Aware Distillation","author":"Yang Yang","year":"2024","unstructured":"Yang Yang et al. ''Rebalanced Vision-Language Retrieval Considering Structure-Aware Distillation''. In: IEEE Transactions on Image Processing (2024).","journal-title":"IEEE Transactions on Image Processing"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11704-023-3186-6"},{"key":"e_1_3_2_1_19_1","volume-title":"Vse: Improving visual-semantic embeddings with hard negatives''. In: arXiv preprint arXiv:1707.05612","author":"Fartash Faghri","year":"2017","unstructured":"Fartash Faghri et al. ''Vse: Improving visual-semantic embeddings with hard negatives''. In: arXiv preprint arXiv:1707.05612 (2017)."},{"key":"e_1_3_2_1_20_1","first-page":"4654","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision.","author":"Kunpeng","year":"2019","unstructured":"Kunpeng Li et al. ''Visual semantic reasoning for image-text matching''. In: Proceedings of the IEEE\/CVF international conference on computer vision. 2019, pp. 4654--4662."},{"key":"e_1_3_2_1_21_1","first-page":"2440","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence.","author":"Shengsheng","year":"2021","unstructured":"Shengsheng Qian et al. ''Dual adversarial graph neural networks for multi-label cross-modal retrieval''. In: Proceedings of the AAAI Conference on Artificial Intelligence. 2021, pp. 2440--2448."},{"key":"e_1_3_2_1_22_1","first-page":"1218","volume-title":"Proceedings of the AAAI conference on artificial intelligence.","author":"Haiwen","year":"2021","unstructured":"Haiwen Diao et al. 
''Similarity reasoning and filtration for image-text matching''. In: Proceedings of the AAAI conference on artificial intelligence. 2021, pp. 1218--1226."},{"key":"e_1_3_2_1_23_1","first-page":"1865","volume-title":"Proceedings of the 44th international ACM SIGIR conference on research and development in information retrieval.","author":"Yi","year":"2021","unstructured":"Yi He et al. ''Cross-graph attention enhanced multi-modal correlation learning for fine-grained image-text retrieval''. In: Proceedings of the 44th international ACM SIGIR conference on research and development in information retrieval. 2021, pp. 1865--1869."},{"key":"e_1_3_2_1_24_1","first-page":"19275","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","author":"Pan Zhengxin","year":"2023","unstructured":"Zhengxin Pan, Fangyu Wu, and Bailing Zhang. ''Fine-grained image-text matching by cross-modal hard aligning network''. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2023, pp. 19275--19284."},{"key":"e_1_3_2_1_25_1","first-page":"15661","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","author":"Kun","year":"2022","unstructured":"Kun Zhang et al. ''Negative-aware attention framework for image-text matching''. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022, pp. 15661--15670."},{"key":"e_1_3_2_1_26_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Jacob","year":"2019","unstructured":"Jacob Devlin et al. ''Bert: Pre-training of deep bidirectional transformers for language understanding''. 
In: Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 2019, pp. 4171--4186."},{"key":"e_1_3_2_1_27_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale''. In: arXiv preprint arXiv:2010.11929","author":"Alexey Dosovitskiy","year":"2020","unstructured":"Alexey Dosovitskiy et al. ''An image is worth 16x16 words: Transformers for image recognition at scale''. In: arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_28_1","first-page":"8748","volume-title":"International conference on machine learning. PMLR.","author":"Alec","year":"2021","unstructured":"Alec Radford et al. ''Learning transferable visual models from natural language supervision''. In: International conference on machine learning. PMLR. 2021, pp. 8748--8763."},{"key":"e_1_3_2_1_29_1","first-page":"2787","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Jiang Ding","year":"2023","unstructured":"Ding Jiang and Mang Ye. ''Cross-modal implicit relation reasoning and aligning for text-to-image person retrieval''. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2023, pp. 2787--2797."},{"key":"e_1_3_2_1_30_1","first-page":"23209","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Kalantidis Yannis","year":"2024","unstructured":"Yannis Kalantidis, Giorgos Tolias, et al. ''Label propagation for zero-shot classification with vision-language models''. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2024, pp. 23209--23218."},{"key":"e_1_3_2_1_31_1","volume-title":"LPOSS: Label Propagation Over Patches and Pixels for Open-vocabulary Semantic Segmentation''. 
In: arXiv preprint arXiv:2503.19777","author":"Vladan","year":"2025","unstructured":"Vladan Stojnić et al. ''LPOSS: Label Propagation Over Patches and Pixels for Open-vocabulary Semantic Segmentation''. In: arXiv preprint arXiv:2503.19777 (2025)."},{"key":"e_1_3_2_1_32_1","first-page":"14308","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","author":"Mouxing","year":"2022","unstructured":"Mouxing Yang et al. ''Learning with twin noisy labels for visible-infrared person re-identification''. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022, pp. 14308--14317."},{"key":"e_1_3_2_1_33_1","first-page":"9397","volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia.","author":"Yue","year":"2024","unstructured":"Yue Duan et al. ''PC2: Pseudo-Classification Based Pseudo-Captioning for Noisy Correspondence Learning in Cross-Modal Retrieval''. In: Proceedings of the 32nd ACM International Conference on Multimedia. 2024, pp. 9397--9406."},{"key":"e_1_3_2_1_34_1","first-page":"26679","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Haochen","year":"2024","unstructured":"Haochen Han et al. ''Learning to rematch mismatched pairs for robust crossmodal retrieval''. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2024, pp. 26679--26688."},{"key":"e_1_3_2_1_35_1","first-page":"17700","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Yuchen","year":"2024","unstructured":"Yuchen Yang et al. ''Robust noisy correspondence learning with equivariant similarity consistency''. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2024, pp. 
17700--17709."},{"key":"e_1_3_2_1_36_1","first-page":"27381","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Zihua","year":"2024","unstructured":"Zihua Zhao et al. ''Mitigating noisy correspondence by geometrical structure consistency learning''. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2024, pp. 27381--27390."},{"key":"e_1_3_2_1_37_1","volume-title":"ReCon: Enhancing True Correspondence Discrimination through Relation Consistency for Robust Noisy Correspondence Learning''. In: arXiv preprint arXiv:2502.19962","author":"Quanxing Zha","year":"2025","unstructured":"Quanxing Zha et al. ''ReCon: Enhancing True Correspondence Discrimination through Relation Consistency for Robust Noisy Correspondence Learning''. In: arXiv preprint arXiv:2502.19962 (2025)."},{"key":"e_1_3_2_1_38_1","first-page":"4948","volume-title":"Proceedings of the 30th ACM International Conference on Multimedia.","author":"Yang","year":"2022","unstructured":"Yang Qin et al. ''Deep evidential learning with noisy correspondence for crossmodal retrieval''. In: Proceedings of the 30th ACM International Conference on Multimedia. 2022, pp. 4948--4956."},{"key":"e_1_3_2_1_39_1","first-page":"24829","volume-title":"Advances in Neural Information Processing Systems","author":"Yang","year":"2023","unstructured":"Yang Qin et al. ''Cross-modal active complementary learning with self-refining correspondence''. In: Advances in Neural Information Processing Systems (2023), pp. 24829--24840."},{"key":"e_1_3_2_1_40_1","first-page":"5070","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","author":"Ahmet","year":"2019","unstructured":"Ahmet Iscen et al. ''Label propagation for deep semi-supervised learning''. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2019, pp. 
5070--5079."},{"key":"e_1_3_2_1_41_1","first-page":"13706","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","author":"Xu Xun","year":"2020","unstructured":"Xun Xu and Gim Hee Lee. ''Weakly supervised semantic point cloud segmentation: Towards 10x fewer labels''. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2020, pp. 13706--13715."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02298"},{"key":"e_1_3_2_1_43_1","first-page":"2994","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision.","author":"Xuefeng","year":"2024","unstructured":"Xuefeng Hu et al. ''Reclip: Refine contrastive language image pre-training with source free domain adaptation''. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 2024, pp. 2994--3003."},{"key":"e_1_3_2_1_44_1","volume-title":"Efficient and Context-Aware Label Propagation for Zero-\/Few-Shot Training-Free Adaptation of Vision-Language Model''. In: arXiv preprint arXiv:2412.18303","author":"Yushu Li","year":"2024","unstructured":"Yushu Li et al. ''Efficient and Context-Aware Label Propagation for Zero-\/Few-Shot Training-Free Adaptation of Vision-Language Model''. In: arXiv preprint arXiv:2412.18303 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"Cobit: A contrastive bi-directional image-text generation model''. In: arXiv preprint arXiv:2303.13455","author":"Haoxuan You","year":"2023","unstructured":"Haoxuan You et al. ''Cobit: A contrastive bi-directional image-text generation model''. In: arXiv preprint arXiv:2303.13455 (2023)."},{"key":"e_1_3_2_1_46_1","volume-title":"Representation learning with contrastive predictive coding''. In: arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 
''Representation learning with contrastive predictive coding''. In: arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_47_1","first-page":"2077","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition.","author":"Ahmet","year":"2017","unstructured":"Ahmet Iscen et al. ''Efficient diffusion on region manifolds: Recovering small objects with compact cnn representations''. In: Proceedings of the IEEE conference on computer vision and pattern recognition. 2017, pp. 2077--2086."},{"key":"e_1_3_2_1_48_1","first-page":"17612","article-title":"Mind the gap: Understanding the modality gap in multimodal contrastive representation learning","volume":"35","author":"Liang Victor Weixin","year":"2022","unstructured":"Victor Weixin Liang et al. ''Mind the gap: Understanding the modality gap in multimodal contrastive representation learning''. In: Advances in Neural Information Processing Systems 35 (2022), pp. 17612--17625.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_49_1","first-page":"9729","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","author":"Kaiming","year":"2020","unstructured":"Kaiming He et al. ''Momentum contrast for unsupervised visual representation learning''. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2020, pp. 9729--9738."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_51_1","first-page":"740","volume-title":"European conference on computer vision.","author":"Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin et al. ''Microsoft coco: Common objects in context''. In: European conference on computer vision. 2014, pp. 
740--755."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_53_1","volume-title":"The Twelfth International Conference on Learning Representations, ICLR 2024","author":"Chun Sanghyuk","year":"2024","unstructured":"Sanghyuk Chun. ''Improved Probabilistic Image-Text Representations''. In: The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7--11, 2024. 2024."},{"key":"e_1_3_2_1_54_1","first-page":"24564","volume-title":"Advances in Neural Information Processing Systems","author":"Hao","year":"2023","unstructured":"Hao Li et al. ''Prototype-based aleatoric uncertainty quantification for crossmodal retrieval''. In: Advances in Neural Information Processing Systems (2023), pp. 24564--24585."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755585","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:04:17Z","timestamp":1765310657000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755585"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":54,"alternative-id":["10.1145\/3746027.3755585","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755585","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}