{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T09:11:32Z","timestamp":1774602692523,"version":"3.50.1"},"reference-count":70,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T00:00:00Z","timestamp":1771632000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T00:00:00Z","timestamp":1771632000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s11263-025-02686-y","type":"journal-article","created":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T07:39:18Z","timestamp":1771659558000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Fine-Grained Multimodal Alignment for Image-Text Retrieval via Graph Learning"],"prefix":"10.1007","volume":"134","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4378-6731","authenticated-orcid":false,"given":"Mao","family":"Chen","sequence":"first","affiliation":[]},{"given":"Xiangkai","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Lu","family":"Qi","sequence":"additional","affiliation":[]},{"given":"Xiangtai","family":"Li","sequence":"additional","affiliation":[]},{"given":"Xu","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Steven C. H.","family":"Hoi","sequence":"additional","affiliation":[]},{"given":"Zhiyong","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Ming-Hsuan","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,21]]},"reference":[{"key":"2686_CR1","unstructured":"Brody, S., Alon, U., & Yahav, E. (2022). How attentive are graph attention networks? Proceedings of the international conference on learning representations."},{"key":"2686_CR2","doi-asserted-by":"crossref","unstructured":"Chen, H., Ding, G., Liu, X., Lin, Z., Liu, J., & Han, J. (2020a). Imram: Iterative matching with recurrent attention memory for cross-modal image-text retrieval. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, (pp 12655\u201312663)","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"2686_CR3","doi-asserted-by":"crossref","unstructured":"Chen, H., Luo, Z., Zhang, J., Zhou, L., Bai, X., Hu, Z., Tai, C.L., & Quan, L. (2021). Learning to match features with seeded graph matching network. Proceedings of the IEEE\/CVF international conference on computer vision (pp. 6301\u20136310)","DOI":"10.1109\/ICCV48922.2021.00624"},{"key":"2686_CR4","unstructured":"Chen, L., Gan, Z., Cheng, Y., Li, L., Carin, L., & Liu, J. (2020b). Graph optimal transport for cross-domain alignment. Proceedings of the international conference on machine learning, (pp 1542\u20131553)"},{"key":"2686_CR5","doi-asserted-by":"crossref","unstructured":"Chen, Y. C., Li, L., Yu, L., El Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., & Liu, J. (2020c). Uniter: Universal image-text representation learning. Proceedings of the european conference on computer vision, (pp 104\u2013120)","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"2686_CR6","doi-asserted-by":"crossref","unstructured":"Chen, Z., Xu, C., Qi, Y., & Guo, J.(2024). Mllm is a strong reranker: Advancing multimodal retrieval-augmented generation via knowledge-enhanced reranking and noise-injected training. arXiv:2407.21439","DOI":"10.18653\/v1\/2025.findings-emnlp.432"},{"key":"2686_CR7","doi-asserted-by":"crossref","unstructured":"Cheng, M., Sun, Y., Wang, L., Zhu, X., Yao, K., Chen, J., Song, G., Han, J., Liu, J., Ding, E., & Wang, J. (2022). Vista: vision and scene text aggregation for cross-modal retrieval. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 5184\u20135193).","DOI":"10.1109\/CVPR52688.2022.00512"},{"key":"2686_CR8","doi-asserted-by":"crossref","unstructured":"Cho, M., Lee, J., & Lee, K. M. (2010). Reweighted random walks for graph matching. Proceedings of the european conference on computer vision (pp. 492\u2013505).","DOI":"10.1007\/978-3-642-15555-0_36"},{"key":"2686_CR9","unstructured":"Cuturi, M. (2013). Sinkhorn distances: Lightspeed computation of optimal transport. Advances in neural information processing systems, (pp 2292\u20132300)."},{"key":"2686_CR10","doi-asserted-by":"publisher","first-page":"3341","DOI":"10.1109\/TIP.2024.3396063","volume":"33","author":"H Diao","year":"2024","unstructured":"Diao, H., Zhang, Y., Gao, S., et al. (2024). Deep boosting learning: A brand-new cooperative approach for image-text matching. IEEE Transactions on Image Processing,33, 3341\u20133352.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2686_CR11","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., & Houlsby, N. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. Proceedings of the international conference on learning representations."},{"issue":"9","key":"2686_CR12","doi-asserted-by":"publisher","first-page":"1774","DOI":"10.1109\/TPAMI.2015.2501802","volume":"38","author":"F Zhou","year":"2016","unstructured":"Zhou, F., & De la Torre, F. (2016). Factorized graph matching. IEEE Transactions on Pattern Analysis and Machine Intelligence,38(9), 1774\u20131789.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2686_CR13","unstructured":"Faghri, F., Fleet, D. J., Kiros, J. R., & Fidler, S. (2017). Vse++: Improving visual-semantic embeddings with hard negatives. Proceedings of the british machine vision conference."},{"key":"2686_CR14","doi-asserted-by":"publisher","first-page":"105171","DOI":"10.1016\/j.imavis.2024.105171","volume":"149","author":"Y Fang","year":"2024","unstructured":"Fang, Y., Sun, Q., Wang, X., Huang, T., Wang, X., & Cao, Y. (2024). Eva-02: A visual representation for neon genesis. Image and Vision Computing,149, 105171.","journal-title":"Image and Vision Computing"},{"key":"2686_CR15","unstructured":"Frome, A., Corrado, G. S., Shlens, J., Bengio, S., Dean, J., Ranzato, M. A., & Mikolov, T. (2013). Devise: A deep visual-semantic embedding model. Advances in neural information processing systems, (pp 2121\u20132129)"},{"key":"2686_CR16","doi-asserted-by":"crossref","unstructured":"Fu, K., Liu, S., Luo, X., & Wang, M. (2021). Robust point cloud registration framework based on deep graph matching. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 8893\u20138902).","DOI":"10.1109\/CVPR46437.2021.00878"},{"key":"2686_CR17","doi-asserted-by":"crossref","unstructured":"Fu, Z., Mao, Z., Song, Y., & Zhang, Y. (2023). Learning semantic relationship among instances for image-text matching. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 15159\u201315168).","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"2686_CR18","doi-asserted-by":"crossref","unstructured":"Fu, Z., Zhang, L., Xia, H., & Mao, Z. (2024). Linguistic-aware patch slimming framework for fine-grained cross-modal alignment. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 26307\u201326316).","DOI":"10.1109\/CVPR52733.2024.02485"},{"key":"2686_CR19","unstructured":"Gan, Z., Chen, Y. C., Li, L., Zhu, C., Cheng, Y., & Liu, J. (2020). Large-scale adversarial training for vision-and-language representation learning. Advances in neural information processing systems, (pp 6616\u20136628)."},{"key":"2686_CR20","doi-asserted-by":"crossref","unstructured":"Gong, Z., Mai, C., & Huang, Y. (2024). Ascl: An asymmetry-sensitive contrastive learning method for image-text retrieval with cross-modal fusion. IEEE international conference on multimedia and expo (pp. 1\u20136).","DOI":"10.1109\/ICME57554.2024.10687993"},{"key":"2686_CR21","unstructured":"Hamilton, W., Ying, Z., & Leskovec, J. (2017). Inductive representation learning on large graphs. Advances in neural information processing systems."},{"key":"2686_CR22","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., & Girshick, R. (2020). Momentum contrast for unsupervised visual representation learning. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 9729\u20139738).","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"2686_CR23","doi-asserted-by":"crossref","unstructured":"Izquierdo, S., & Civera, J. (2024). Optimal transport aggregation for visual place recognition. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 17658\u201317668).","DOI":"10.1109\/CVPR52733.2024.01672"},{"key":"2686_CR24","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q., Sung, Y.-H., Li, Z., & Duerig, T. (2021). Scaling up visual and vision-language representation learning with noisy text supervision. Proceedings of the international conference on machine learning (pp. 4904\u20134916)."},{"issue":"1","key":"2686_CR25","doi-asserted-by":"publisher","first-page":"641","DOI":"10.1109\/TPAMI.2022.3148470","volume":"45","author":"K Li","year":"2023","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., & Fu, Y. (2023). Image-text embedding learning via visual and textual semantic reasoning. IEEE Transactions on Pattern Analysis and Machine Intelligence,45(1), 641\u2013656.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2686_CR26","unstructured":"Karpathy, A., Joulin, A., Fei-Fei, L.F. (2014). Deep fragment embeddings for bidirectional image sentence mapping. Advances in neural information processing systems, (pp 1889\u20131897)."},{"key":"2686_CR27","doi-asserted-by":"crossref","unstructured":"Khan, Z., Vijay Kumar, B., Yu, X., Schulter, S., Chandraker, M., & Fu, Y. (2022). Single-stream multi-level alignment for vision-language pretraining. Proceedings of the european conference on computer vision (pp. 735\u2013751).","DOI":"10.1007\/978-3-031-20059-5_42"},{"key":"2686_CR28","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S. (2014). Unifying visual-semantic embeddings with multimodal neural language models. arXiv:1411.2539"},{"issue":"4","key":"2686_CR29","doi-asserted-by":"publisher","first-page":"586","DOI":"10.1287\/mnsc.9.4.586","volume":"9","author":"EL Lawler","year":"1963","unstructured":"Lawler, E. L. (1963). The quadratic assignment problem. Management Science,9(4), 586\u2013599.","journal-title":"Management Science"},{"key":"2686_CR30","doi-asserted-by":"crossref","unstructured":"Lee, K. H., Chen, X., Hua, G., Hu, H., & He, X. (2018). Stacked cross attention for image-text matching. Proceedings of the european conference on computer vision (pp. 201\u2013216).","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"2686_CR31","doi-asserted-by":"crossref","unstructured":"Leordeanu, M., & Hebert, M. (2005). A spectral technique for correspondence problems using pairwise constraints. Proceedings of the IEEE\/CVF international conference on computer vision (pp. 1482\u20131489).","DOI":"10.1109\/ICCV.2005.20"},{"key":"2686_CR32","doi-asserted-by":"crossref","unstructured":"Leordeanu, M., Zanfir, A., & Sminchisescu, C. (2013). Locally affine sparse-to-dense matching for motion and occlusion estimation. Proceedings of the IEEE\/CVF international conference on computer vision (pp. 1721\u20131728).","DOI":"10.1109\/ICCV.2013.216"},{"key":"2686_CR33","unstructured":"Lewis, P., Perez, E., Piktus, A., Petroni, F., Karpukhin, V., Goyal, N., K\u00fcttler, H., Lewis, M., Yih, W., & Rockt\u00e4schel, T. and others. (2020). Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems,33, 9459\u20139474."},{"key":"2686_CR34","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., & Jiang, D. (2020). Unicoder-vl: A universal encoder for vision and language by cross-modal pre-training. Proceedings of the AAAI conference on artificial intelligence (pp. 11336\u201311344).","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"2686_CR35","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., & Hoi, S. C. H. (2021). Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, (pp 9694\u20139705)."},{"key":"2686_CR36","unstructured":"Li, J., Li, D., Xiong, C., & Hoi, S. (2022). BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation. Proceedings of the international conference on machine learning (pp. 12888\u201312900)."},{"key":"2686_CR37","unstructured":"Li, J., Li, D., Savarese, S., & Hoi, S. (2023). Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. Proceedings of the international conference on machine learning (pp. 19730\u201319742)."},{"key":"2686_CR38","doi-asserted-by":"crossref","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., & Fu, Y. (2019). Visual semantic reasoning for image-text matching. Proceedings of the IEEE\/CVF international conference on computer vision (pp. 4654\u20134662).","DOI":"10.1109\/ICCV.2019.00475"},{"key":"2686_CR39","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft coco: Common objects in context. Proceedings of the european conference on computer vision (pp. 740\u2013755).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2686_CR40","doi-asserted-by":"crossref","unstructured":"Lin, W., Chen, J., Mei, J., Coca, A., & Byrne, B. (2023) Fine-grained late-interaction multi-modal retrieval for retrieval augmented visual question answering. Advances in neural information processing systems, (pp 22820\u201322840)","DOI":"10.52202\/075280-0990"},{"key":"2686_CR41","doi-asserted-by":"crossref","unstructured":"Liu, C., Mao, Z., Zhang, T., Xie, H., Wang, B., & Zhang, Y. (2020). Graph structured network for image-text matching. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 10921\u201310930).","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"2686_CR42","doi-asserted-by":"crossref","unstructured":"Lu, H., Fei, N., Huo, Y., Gao, Y., Lu, Z., & Wen, J. R. (2022). Cots: Collaborative two-stream vision-language pre-training model for cross-modal retrieval. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 15692\u201315701).","DOI":"10.1109\/CVPR52688.2022.01524"},{"key":"2686_CR43","doi-asserted-by":"publisher","first-page":"26506","DOI":"10.1109\/ACCESS.2025.3539926","volume":"13","author":"L Manella Pereira","year":"2025","unstructured":"Manella Pereira, L., & Hadi Amini, M. (2025). A survey on optimal transport for machine learning: Theory and applications. IEEE Access,13, 26506\u201326526.","journal-title":"IEEE Access"},{"key":"2686_CR44","doi-asserted-by":"crossref","unstructured":"Mayer, C., Danelljan, M., Paudel, D. P., & Van Gool, L. (2021). Learning target candidate association to keep track of what not to track. Proceedings of the IEEE\/CVF international conference on computer vision (pp. 13444\u201313454).","DOI":"10.1109\/ICCV48922.2021.01319"},{"issue":"12","key":"2686_CR45","doi-asserted-by":"publisher","first-page":"2262","DOI":"10.1109\/TPAMI.2010.46","volume":"32","author":"A Myronenko","year":"2010","unstructured":"Myronenko, A., & Song, X. (2010). Point set registration: Coherent point drift. IEEE Transactions on Pattern Analysis and Machine Intelligence,32(12), 2262\u20132275.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2686_CR46","doi-asserted-by":"crossref","unstructured":"Pan, Z., Wu, F., & Zhang, B. (2023). Fine-grained image-text matching by cross-modal hard aligning network. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 19275\u201319284).","DOI":"10.1109\/CVPR52729.2023.01847"},{"issue":"5\u20136","key":"2686_CR47","first-page":"355","volume":"11","author":"G Peyr\u00e9","year":"2019","unstructured":"Peyr\u00e9, G., & Cuturi, M. (2019). Computational optimal transport: With applications to data science. Foundations and trends\u00ae. Machine Learning,11(5\u20136), 355\u2013607.","journal-title":"Machine Learning"},{"key":"2686_CR48","unstructured":"Peyr\u00e9, G., Cuturi, M., & Solomon, J. (2016). Gromov-wasserstein averaging of kernel and distance matrices. Proceedings of the international conference on machine learning (pp. 2664\u20132672)."},{"key":"2686_CR49","doi-asserted-by":"crossref","unstructured":"Plummer, B. A., Wang, L., Cervantes, C. M., Caicedo, J. C., Hockenmaier, J., & Lazebnik, S. (2015). Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. Proceedings of the IEEE\/CVF international conference on computer vision (pp. 2641\u20132649).","DOI":"10.1109\/ICCV.2015.303"},{"key":"2686_CR50","doi-asserted-by":"crossref","unstructured":"Plummer, B. A., Wang, L., Cervantes, C. M., Caicedo, J. C., Hockenmaier, J., & Lazebnik, S. (2015). Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. Proceedings of the IEEE\/CVF international conference on computer vision (pp. 2641\u20132649).","DOI":"10.1109\/ICCV.2015.303"},{"key":"2686_CR51","doi-asserted-by":"crossref","unstructured":"Qu, J., Ling, H., Zhang, C., Lyu, X., & Tang, Z. (2021a). Adaptive edge attention for graph matching with outliers. Proceedings of the international joint conference on artificial intelligence, (pp 966\u2013972)","DOI":"10.24963\/ijcai.2021\/134"},{"key":"2686_CR52","doi-asserted-by":"crossref","unstructured":"Qu, L., Liu, M., Wu, J., Gao, Z., & Nie, L. (2021b). Dynamic modality interaction modeling for image-text retrieval. Proceedings of the ACM SIGIR conference on research and development in information retrieval, (pp 1104\u20131113)","DOI":"10.1145\/3404835.3462829"},{"key":"2686_CR53","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., & Sutskever, I. (2021). Learning transferable visual models from natural language supervision. Proceedings of the International Conference on Machine Learning (pp. 8748\u20138763)."},{"key":"2686_CR54","doi-asserted-by":"crossref","unstructured":"Sarlin, P. E., DeTone, D., Malisiewicz, T., & Rabinovich, A. (2020). SuperGlue: Learning feature matching with graph neural networks. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 4938\u20134947).","DOI":"10.1109\/CVPR42600.2020.00499"},{"key":"2686_CR55","doi-asserted-by":"crossref","unstructured":"Selvaraju, R. R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., & Batra, D. (2017). Grad-cam: Visual explanations from deep networks via gradient-based localization. Proceedings of the IEEE\/CVF international conference on computer vision (pp. 618\u2013626).","DOI":"10.1109\/ICCV.2017.74"},{"key":"2686_CR56","unstructured":"Tschannen, M., Gritsenko, A., Wang, X., Naeem, M. F., Alabdulmohsin, I., Parthasarathy, N., Evans, T., Beyer, L., Xia, Y., & Mustafa, B. and others. (2025) Siglip 2: Multilingual vision-language encoders with improved semantic understanding, localization, and dense features. arXiv:2502.14786"},{"key":"2686_CR57","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, (pp 6000\u20136010)."},{"key":"2686_CR58","doi-asserted-by":"crossref","unstructured":"Wang, R., Yan, J., & Yang, X. (2019). Learning combinatorial embedding networks for deep graph matching. Proceedings of the IEEE\/CVF international conference on computer vision (pp. 3056\u20133065).","DOI":"10.1109\/ICCV.2019.00315"},{"key":"2686_CR59","doi-asserted-by":"crossref","unstructured":"Wang, W., Bao, H., Dong, L., Bjorck, J., Peng, Z., Liu, Q.,Aggarwal, K., Mohammed, O.K., Singhal, S., Som, S. and others. (2023). Image as a foreign language: Beit pretraining for vision and vision-language tasks. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 19175\u201319186).","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"2686_CR60","doi-asserted-by":"crossref","unstructured":"Wei, X., Zhang, T., Li, Y., Zhang, Y., & Wu, F. (2020). Multi-modality cross attention network for image and sentence matching. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 10941\u201310950).","DOI":"10.1109\/CVPR42600.2020.01095"},{"key":"2686_CR61","doi-asserted-by":"crossref","unstructured":"Wen, K., Xia, J., Huang, Y., Li, L., Xu, J., & Shao, J. (2021). Cookie: Contrastive cross-modal knowledge sharing pre-training for vision-language representation. Proceedings of the IEEE\/CVF international conference on computer vision (pp. 2208\u20132217).","DOI":"10.1109\/ICCV48922.2021.00221"},{"issue":"12","key":"2686_CR62","doi-asserted-by":"publisher","first-page":"5412","DOI":"10.1109\/TNNLS.2020.2967597","volume":"31","author":"X Xu","year":"2020","unstructured":"Xu, X., Wang, T., Yang, Y., Zuo, L., Shen, F., & Shen, H. T. (2020). Cross-modal attention with semantic consistence for image-text matching. IEEE Transactions on Neural Networks and Learning Systems,31(12), 5412\u20135425.","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"issue":"8","key":"2686_CR63","doi-asserted-by":"publisher","first-page":"1809","DOI":"10.1109\/TPAMI.2019.2903483","volume":"42","author":"X Yang","year":"2020","unstructured":"Yang, X., Liu, Z. Y., & Qiao, H. (2020). A continuation method for graph matching based feature correspondence. IEEE Transactions on Pattern Analysis and Machine Intelligence,42(8), 1809\u20131822.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2686_CR64","unstructured":"Yao, L., Huang, R., Hou, L., Lu, G., Niu, M., Xu, H., Liang, X., Li, Z., Zhenguo, J., Jiang, X., & Xu, C. (2021) FILIP: Fine-grained interactive language-image pre-training. arXiv:2111.07783"},{"key":"2686_CR65","doi-asserted-by":"crossref","unstructured":"Yu, F., Tang, J., Yin, W., Sun, Y., Tian, H., Wu, H., & Wang, H. (2021). ERNIE-ViL: Knowledge enhanced vision-language representations through scene graphs. Proceedings of the AAAI conference on artificial intelligence (pp. 3208\u20133216).","DOI":"10.1609\/aaai.v35i4.16431"},{"key":"2686_CR66","doi-asserted-by":"crossref","unstructured":"Yu, T., Yang, Y., Li, Y., Liu, L., Fei, H., & Li, P. (2021b). Heterogeneous attention network for effective and efficient cross-modal retrieval. Proceedings of the ACM SIGIR conference on research and development in information retrieval, (pp 1146\u20131156).","DOI":"10.1145\/3404835.3462924"},{"key":"2686_CR67","unstructured":"Yuan, L., Chen, D., Chen, Y. L., Codella, N., Dai, X., Gao, J., Hu, H., Huang, X., Li, B., & Li, C. and others.(2021) Florence: A new foundation model for computer vision. arXiv:2111.11432"},{"issue":"12","key":"2686_CR68","doi-asserted-by":"publisher","first-page":"2227","DOI":"10.1109\/TPAMI.2008.245","volume":"31","author":"M Zaslavskiy","year":"2009","unstructured":"Zaslavskiy, M., Bach, F., & Vert, J. P. (2009). A path following algorithm for the graph matching problem. IEEE Transactions on Pattern Analysis and Machine Intelligence,31(12), 2227\u20132242.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"7","key":"2686_CR69","doi-asserted-by":"publisher","first-page":"1681","DOI":"10.1109\/TMM.2018.2888822","volume":"21","author":"Z Zhang","year":"2019","unstructured":"Zhang, Z., Wu, Q., Wang, Y., & Chen, F. (2019). High-quality image captioning with fine-grained and semantic-guided visual attention. IEEE Transactions on Multimedia,21(7), 1681\u20131693.","journal-title":"IEEE Transactions on Multimedia"},{"key":"2686_CR70","doi-asserted-by":"crossref","unstructured":"Zhou, F., & De la Torre, F. (2013). Deformable graph matching. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 2922\u20132929).","DOI":"10.1109\/CVPR.2013.376"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02686-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02686-y","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02686-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:35:29Z","timestamp":1774600529000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02686-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,21]]},"references-count":70,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["2686"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02686-y","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,21]]},"assertion":[{"value":"13 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 September 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"127"}}