{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T17:07:01Z","timestamp":1772644021937,"version":"3.50.1"},"reference-count":73,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,12,15]],"date-time":"2025-12-15T00:00:00Z","timestamp":1765756800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,12,15]],"date-time":"2025-12-15T00:00:00Z","timestamp":1765756800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science  Foundation of China","doi-asserted-by":"crossref","award":["61873160"],"award-info":[{"award-number":["61873160"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Natural Science Foundation of Shanghai","award":["25ZR1401156"],"award-info":[{"award-number":["25ZR1401156"]}]},{"name":"Natural Science Foundation of  Shanghai","award":["25ZR1402184"],"award-info":[{"award-number":["25ZR1402184"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1007\/s00371-025-04250-8","type":"journal-article","created":{"date-parts":[[2025,12,15]],"date-time":"2025-12-15T18:00:20Z","timestamp":1765821620000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Enhancing image-text matching through contextual fine-grained alignment"],"prefix":"10.1007","volume":"42","author":[{"given":"FanRong","family":"Meng","sequence":"first","affiliation":[]},{"given":"Dezhi","family":"Han","sequence":"additional","affiliation":[]},{"given":"Xiang","family":"Shen","sequence":"additional","affiliation":[]},{"given":"Chongqing","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Qun","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,12,15]]},"reference":[{"key":"4250_CR1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2025.3558446","author":"Q Lin","year":"2025","unstructured":"Lin, Q., He, K., Zhu, Y., Xu, F., Cambria, E., Feng, M.: Cross-modal knowledge diffusion-based generation for difference-aware medical VQA. IEEE Trans. Image Process. (2025). https:\/\/doi.org\/10.1109\/TIP.2025.3558446","journal-title":"IEEE Trans. Image Process."},{"key":"4250_CR2","unstructured":"Wu, W., Li, Z., He, Y., Shou, M.Z., Shen, C., Cheng, L., Li, Y., Gao, T., Zhang, D.: Paragraph-to-image generation with information-enriched diffusion model. International Journal of Computer Vision, 1\u201322 (2025)"},{"issue":"7","key":"4250_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3715093","volume":"57","author":"M Ma\u0142ki\u0144ski","year":"2025","unstructured":"Ma\u0142ki\u0144ski, M., Ma\u0144dziuk, J.: Deep learning methods for abstract visual reasoning: a survey on raven\u2019s progressive matrices. ACM Comput. Surv. 57(7), 1\u201336 (2025)","journal-title":"ACM Comput. Surv."},{"key":"4250_CR4","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: Vse++: improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)"},{"key":"4250_CR5","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"4250_CR6","doi-asserted-by":"crossref","unstructured":"Chen, H., Ding, G., Liu, X., Lin, Z., Liu, J., Han, J.: Imram: iterative matching with recurrent attention memory for cross-modal image-text retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12655\u201312663 (2020)","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"4250_CR7","doi-asserted-by":"crossref","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (long and Short Papers), pp. 4171\u20134186 (2019)","DOI":"10.18653\/v1\/N19-1423"},{"key":"4250_CR8","doi-asserted-by":"crossref","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., Fu, Y.: Visual semantic reasoning for image-text matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4654\u20134662 (2019)","DOI":"10.1109\/ICCV.2019.00475"},{"key":"4250_CR9","unstructured":"Li, Y., Gu, C., Dullien, T., Vinyals, O., Kohli, P.: Graph matching networks for learning the similarity of graph structured objects. In: International Conference on Machine Learning, pp. 3835\u20133845 (2019). PMLR"},{"key":"4250_CR10","doi-asserted-by":"crossref","unstructured":"Liu, C., Mao, Z., Zhang, T., Xie, H., Wang, B., Zhang, Y.: Graph structured network for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10921\u201310930 (2020)","DOI":"10.1109\/CVPR42600.2020.01093"},{"issue":"3","key":"4250_CR11","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3714431","volume":"16","author":"X Ge","year":"2025","unstructured":"Ge, X., Chen, F., Xu, S., Tao, F., Wang, J., Jose, J.M.: Hire: hybrid-modal interaction with multiple relational enhancements for image-text matching. ACM Trans. Intell. Syst. Technol. 16(3), 1\u201323 (2025)","journal-title":"ACM Trans. Intell. Syst. Technol."},{"issue":"2","key":"4250_CR12","doi-asserted-by":"publisher","first-page":"684","DOI":"10.1109\/TPAMI.2019.2911066","volume":"44","author":"R Hong","year":"2019","unstructured":"Hong, R., Liu, D., Mo, X., He, X., Zhang, H.: Learning to compose and reason with language tree structures for visual grounding. IEEE Trans. Pattern Anal. Mach. Intell. 44(2), 684\u2013696 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"2","key":"4250_CR13","doi-asserted-by":"publisher","first-page":"2194","DOI":"10.1109\/TNNLS.2022.3188569","volume":"35","author":"S-J Peng","year":"2022","unstructured":"Peng, S.-J., He, Y., Liu, X., Cheung, Y.-M., Xu, X., Cui, Z.: Relation-aggregated cross-graph correlation learning for fine-grained image-text retrieval. IEEE Trans. Neural Netw. Learn. Syst. 35(2), 2194\u20132207 (2022)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"4250_CR14","doi-asserted-by":"crossref","unstructured":"Li, L., Gan, Z., Cheng, Y., Liu, J.: Relation-aware graph attention network for visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10313\u201310322 (2019)","DOI":"10.1109\/ICCV.2019.01041"},{"issue":"1","key":"4250_CR15","first-page":"1","volume":"20","author":"J Wang","year":"2023","unstructured":"Wang, J., Shuai, H.-H., Li, Y.-H., Cheng, W.-H.: Language-guided residual graph attention network and data augmentation for visual grounding. ACM Trans. Multimed. Comput. Commun. Appl. 20(1), 1\u201323 (2023)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"4250_CR16","doi-asserted-by":"crossref","unstructured":"Wang, Y., Yang, H., Qian, X., Ma, L., Lu, J., Li, B., Fan, X.: Position focused attention network for image-text matching. arXiv preprint arXiv:1907.09748 (2019)","DOI":"10.24963\/ijcai.2019\/526"},{"key":"4250_CR17","doi-asserted-by":"publisher","first-page":"6997","DOI":"10.1109\/TMM.2022.3216770","volume":"25","author":"A Mao","year":"2022","unstructured":"Mao, A., Yang, Z., Lin, K., Xuan, J., Liu, Y.-J.: Positional attention guided transformer-like architecture for visual question answering. IEEE Trans. Multimed. 25, 6997\u20137009 (2022)","journal-title":"IEEE Trans. Multimed."},{"key":"4250_CR18","unstructured":"Xia, Y., Huang, L., Wang, W., Wei, X.: Parnet: position-aware aggregated relation network for image-text matching. arXiv preprint arXiv:1906.06892 (2019)"},{"key":"4250_CR19","doi-asserted-by":"publisher","first-page":"5060","DOI":"10.1109\/TIP.2023.3310332","volume":"32","author":"S Zhou","year":"2023","unstructured":"Zhou, S., Guo, D., Li, J., Yang, X., Wang, M.: Exploring sparse spatial relation in graph inference for text-based VQA. IEEE Trans. Image Process. 32, 5060\u20135074 (2023)","journal-title":"IEEE Trans. Image Process."},{"issue":"2","key":"4250_CR20","doi-asserted-by":"publisher","first-page":"948","DOI":"10.1109\/TCYB.2022.3179020","volume":"54","author":"X Liu","year":"2022","unstructured":"Liu, X., He, Y., Cheung, Y.-M., Xu, X., Wang, N.: Learning relationship-enhanced semantic graph for fine-grained image-text matching. IEEE Trans. Cybern. 54(2), 948\u2013961 (2022)","journal-title":"IEEE Trans. Cybern."},{"key":"4250_CR21","unstructured":"Xie, X., Hou, C., Li, Z.: Fine-grained matching with multi-perspective similarity modeling for cross-modal retrieval. In: Uncertainty in Artificial Intelligence, pp. 2148\u20132158 (2022). PMLR"},{"key":"4250_CR22","doi-asserted-by":"publisher","first-page":"9189","DOI":"10.1109\/TMM.2023.3248160","volume":"25","author":"J Guo","year":"2023","unstructured":"Guo, J., Wang, M., Zhou, Y., Song, B., Chi, Y., Fan, W., Chang, J.: Hgan: hierarchical graph alignment network for image-text retrieval. IEEE Trans. Multimed. 25, 9189\u20139202 (2023)","journal-title":"IEEE Trans. Multimed."},{"key":"4250_CR23","doi-asserted-by":"publisher","first-page":"7555","DOI":"10.1109\/TMM.2024.3369968","volume":"26","author":"S Pang","year":"2024","unstructured":"Pang, S., Zeng, Y., Zhao, J., Xue, J.: A mutually textual and visual refinement network for image-text matching. IEEE Trans. Multimed. 26, 7555\u20137566 (2024)","journal-title":"IEEE Trans. Multimed."},{"key":"4250_CR24","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.110084","volume":"147","author":"C Chen","year":"2024","unstructured":"Chen, C., Han, D., Chang, C.-C.: Mpcct: multimodal vision-language learning paradigm with context-based compact transformer. Pattern Recogn. 147, 110084 (2024)","journal-title":"Pattern Recogn."},{"key":"4250_CR25","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2025.128857","volume":"295","author":"X Shen","year":"2025","unstructured":"Shen, X., Han, D., Chang, C.-C., Xu, Y., Chen, C.: Multimodal context-aware consistency alignment for vision-language tasks. Expert Syst. Appl. 295, 128857 (2025)","journal-title":"Expert Syst. Appl."},{"key":"4250_CR26","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, Y., Lazebnik, S.: Learning deep structure-preserving image-text embeddings. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5005\u20135013 (2016)","DOI":"10.1109\/CVPR.2016.541"},{"issue":"9","key":"4250_CR27","doi-asserted-by":"publisher","first-page":"6437","DOI":"10.1109\/TCSVT.2022.3164230","volume":"32","author":"X Dong","year":"2022","unstructured":"Dong, X., Zhang, H., Zhu, L., Nie, L., Liu, L.: Hierarchical feature aggregation based on transformer for image-text matching. IEEE Trans. Circuits Syst. Video Technol. 32(9), 6437\u20136447 (2022)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"4250_CR28","doi-asserted-by":"crossref","unstructured":"Ji, Z., Wang, H., Han, J., Pang, Y.: Saliency-guided attention network for image-sentence matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5754\u20135763 (2019)","DOI":"10.1109\/ICCV.2019.00585"},{"key":"4250_CR29","doi-asserted-by":"crossref","unstructured":"Chen, J., Hu, H., Wu, H., Jiang, Y., Wang, C.: Learning the best pooling strategy for visual semantic embedding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15789\u201315798 (2021)","DOI":"10.1109\/CVPR46437.2021.01553"},{"key":"4250_CR30","doi-asserted-by":"crossref","unstructured":"Lee, K.-H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 201\u2013216 (2018)","DOI":"10.1007\/978-3-030-01225-0_13"},{"issue":"6","key":"4250_CR31","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2016","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. IEEE Trans. Pattern Anal. Mach. Intell. 39(6), 1137\u20131149 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"4250_CR32","doi-asserted-by":"crossref","unstructured":"Zhang, K., Mao, Z., Wang, Q., Zhang, Y.: Negative-aware attention framework for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15661\u201315670 (2022)","DOI":"10.1109\/CVPR52688.2022.01521"},{"key":"4250_CR33","doi-asserted-by":"crossref","unstructured":"Huang, Y., Wu, Q., Song, C., Wang, L.: Learning semantic concepts and order for image and sentence matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6163\u20136171 (2018)","DOI":"10.1109\/CVPR.2018.00645"},{"key":"4250_CR34","doi-asserted-by":"publisher","first-page":"6590","DOI":"10.1109\/TCSVT.2024.3369656","volume":"34","author":"Z Li","year":"2024","unstructured":"Li, Z., Zhang, L., Zhang, K., Zhang, Y., Mao, Z.: Improving image-text matching with bidirectional consistency of cross-modal alignment. IEEE Trans. Circuits Syst. Video Technol. 34, 6590\u20136607 (2024)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"7","key":"4250_CR35","doi-asserted-by":"publisher","first-page":"4926","DOI":"10.1109\/TPAMI.2024.3365104","volume":"46","author":"Y Tu","year":"2024","unstructured":"Tu, Y., Li, L., Su, L., Zha, Z.-J., Huang, Q.: Smart: syntax-calibrated multi-aspect relation transformer for change captioning. IEEE Trans. Pattern Anal. Mach. Intell. 46(7), 4926\u20134943 (2024)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"9","key":"4250_CR36","doi-asserted-by":"publisher","first-page":"2354","DOI":"10.1109\/TMM.2019.2957948","volume":"22","author":"X Fu","year":"2019","unstructured":"Fu, X., Zhao, Y., Wei, Y., Zhao, Y., Wei, S.: Rich features embedding for cross-modal retrieval: a simple baseline. IEEE Trans. Multimed. 22(9), 2354\u20132365 (2019)","journal-title":"IEEE Trans. Multimed."},{"key":"4250_CR37","doi-asserted-by":"crossref","unstructured":"Qu, L., Liu, M., Wu, J., Gao, Z., Nie, L.: Dynamic modality interaction modeling for image-text retrieval. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1104\u20131113 (2021)","DOI":"10.1145\/3404835.3462829"},{"key":"4250_CR38","doi-asserted-by":"crossref","unstructured":"Wu, Y., Wang, S., Huang, Q.: Learning semantic structure-preserved embeddings for cross-modal retrieval. In: Proceedings of the 26th ACM International Conference on Multimedia, pp. 825\u2013833 (2018)","DOI":"10.1145\/3240508.3240521"},{"issue":"1","key":"4250_CR39","doi-asserted-by":"publisher","first-page":"641","DOI":"10.1109\/TPAMI.2022.3148470","volume":"45","author":"K Li","year":"2022","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., Fu, Y.: Image-text embedding learning via visual and textual semantic reasoning. IEEE Trans. Pattern Anal. Mach. Intell. 45(1), 641\u2013656 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"4250_CR40","doi-asserted-by":"crossref","unstructured":"Ji, Z., Chen, K., Wang, H.: Step-wise hierarchical alignment network for image-text matching. arXiv preprint arXiv:2106.06509 (2021)","DOI":"10.24963\/ijcai.2021\/106"},{"key":"4250_CR41","doi-asserted-by":"crossref","unstructured":"Tu, Y., Li, L., Su, L., Huang, Q.: Query-centric audio-visual cognition network for moment retrieval, segmentation and step-captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 39, pp. 7464\u20137472 (2025)","DOI":"10.1609\/aaai.v39i7.32803"},{"key":"4250_CR42","doi-asserted-by":"crossref","unstructured":"Shi, B., Ji, L., Lu, P., Niu, Z., Duan, N.: Knowledge aware semantic concept expansion for image-text matching. In: IJCAI, vol. 1, p. 2 (2019)","DOI":"10.24963\/ijcai.2019\/720"},{"key":"4250_CR43","doi-asserted-by":"publisher","first-page":"3565","DOI":"10.1109\/TIP.2022.3159472","volume":"31","author":"Y Tu","year":"2022","unstructured":"Tu, Y., Li, L., Su, L., Gao, S., Yan, C., Zha, Z.-J., Yu, Z., Huang, Q.: $${\\rm I}^2$$ transformer: intra-and inter-relation embedding transformer for TV show captioning. IEEE Trans. Image Process. 31, 3565\u20133577 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"4250_CR44","doi-asserted-by":"publisher","first-page":"3362","DOI":"10.1109\/TMM.2020.3024822","volume":"23","author":"Y Wang","year":"2020","unstructured":"Wang, Y., Yang, H., Bai, X., Qian, X., Ma, L., Lu, J., Li, B., Fan, X.: Pfan++: bi-directional image-text retrieval with position focused attention network. IEEE Trans. Multimed. 23, 3362\u20133376 (2020)","journal-title":"IEEE Trans. Multimed."},{"key":"4250_CR45","doi-asserted-by":"crossref","unstructured":"Long, S., Han, S.C., Wan, X., Poon, J.: Gradual: graph-based dual-modal representation for image-text matching. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 3459\u20133468 (2022)","DOI":"10.1109\/WACV51458.2022.00252"},{"key":"4250_CR46","doi-asserted-by":"publisher","first-page":"8933","DOI":"10.1109\/TMM.2023.3243665","volume":"25","author":"C Chen","year":"2023","unstructured":"Chen, C., Wang, D., Song, B., Tan, H.: Inter-intra modal representation augmentation with dct-transformer adversarial network for image-text matching. IEEE Trans. Multimed. 25, 8933\u20138945 (2023)","journal-title":"IEEE Trans. Multimed."},{"key":"4250_CR47","doi-asserted-by":"crossref","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L.-J., Shamma, D.A., et al.: Connecting language and vision using crowdsourced dense image annotations. Vis. Genome (2016)","DOI":"10.1007\/s11263-016-0981-7"},{"key":"4250_CR48","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"4250_CR49","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)"},{"key":"4250_CR50","unstructured":"Norcliffe-Brown, W., Vafeias, S., Parisot, S.: Learning conditioned graph structures for interpretable visual question answering. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"4250_CR51","doi-asserted-by":"crossref","unstructured":"Bevilacqua, M., Blloshmi, R., Navigli, R.: One spring to rule them both: symmetric amr semantic parsing and generation without a complex pipeline. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, pp. 12564\u201312573 (2021)","DOI":"10.1609\/aaai.v35i14.17489"},{"key":"4250_CR52","doi-asserted-by":"crossref","unstructured":"Yang, B., Zhao, K., Tang, C., Liu, D., Zhan, L., Lin, C.: Emphasising structured information: integrating abstract meaning representation into llms for enhanced open-domain dialogue evaluation. arXiv preprint arXiv:2404.01129 (2024)","DOI":"10.18653\/v1\/2025.findings-emnlp.1096"},{"key":"4250_CR53","doi-asserted-by":"crossref","unstructured":"Diao, H., Zhang, Y., Ma, L., Lu, H.: Similarity reasoning and filtration for image-text matching. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, pp. 1218\u20131226 (2021)","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"4250_CR54","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"4250_CR55","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part V 13, pp. 740\u2013755 (2014). Springer","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"4250_CR56","unstructured":"Karpathy, A., Joulin, A., Fei-Fei, L.F.: Deep fragment embeddings for bidirectional image sentence mapping. In: Advances in Neural Information Processing Systems, vol. 27 (2014)"},{"key":"4250_CR57","doi-asserted-by":"publisher","first-page":"9193","DOI":"10.1109\/TIP.2021.3123553","volume":"30","author":"J Li","year":"2021","unstructured":"Li, J., Liu, L., Niu, L., Zhang, L.: Memorize, associate and match: embedding enhancement via fine-grained alignment for image-text retrieval. IEEE Trans. Image Process. 30, 9193\u20139207 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"4250_CR58","doi-asserted-by":"crossref","unstructured":"Li, J., Niu, L., Zhang, L.: Action-aware embedding enhancement for image-text retrieval. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 1323\u20131331 (2022)","DOI":"10.1609\/aaai.v36i2.20020"},{"key":"4250_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, H., Mao, Z., Zhang, K., Zhang, Y.: Show your faith: cross-modal confidence-aware network for image-text matching. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 3262\u20133270 (2022)","DOI":"10.1609\/aaai.v36i3.20235"},{"issue":"1","key":"4250_CR60","doi-asserted-by":"publisher","first-page":"388","DOI":"10.1109\/TCSVT.2021.3060713","volume":"32","author":"J Wu","year":"2021","unstructured":"Wu, J., Wu, C., Lu, J., Wang, L., Cui, X.: Region reinforcement network with topic constraint for image-text matching. IEEE Trans. Circuits Syst. Video Technol. 32(1), 388\u2013397 (2021)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"4250_CR61","doi-asserted-by":"publisher","first-page":"1320","DOI":"10.1109\/TMM.2022.3141603","volume":"25","author":"K Zhang","year":"2022","unstructured":"Zhang, K., Mao, Z., Liu, A.-A., Zhang, Y.: Unified adaptive relevance distinguishable attention network for image-text matching. IEEE Trans. Multimed. 25, 1320\u20131332 (2022)","journal-title":"IEEE Trans. Multimed."},{"issue":"11","key":"4250_CR62","doi-asserted-by":"publisher","first-page":"8037","DOI":"10.1109\/TCSVT.2022.3182426","volume":"32","author":"S Yang","year":"2022","unstructured":"Yang, S., Li, Q., Li, W., Li, X., Liu, A.-A.: Dual-level representation enhancement on characteristic and context for image-text retrieval. IEEE Trans. Circuits Syst. Video Technol. 32(11), 8037\u20138050 (2022)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"4250_CR63","doi-asserted-by":"crossref","unstructured":"Li, Z., Guo, C., Feng, Z., Hwang, J.-N., Xue, X.: Multi-view visual semantic embedding. In: IJCAI vol. 2, p. 7 (2022)","DOI":"10.24963\/ijcai.2022\/158"},{"key":"4250_CR64","doi-asserted-by":"publisher","first-page":"2322","DOI":"10.1109\/TIP.2023.3266887","volume":"32","author":"H Diao","year":"2023","unstructured":"Diao, H., Zhang, Y., Liu, W., Ruan, X., Lu, H.: Plug-and-play regulators for image-text matching. IEEE Trans. Image Process. 32, 2322\u20132334 (2023)","journal-title":"IEEE Trans. Image Process."},{"issue":"10","key":"4250_CR65","doi-asserted-by":"publisher","first-page":"6144","DOI":"10.1109\/TCSVT.2023.3254530","volume":"33","author":"Y Wang","year":"2023","unstructured":"Wang, Y., Su, Y., Li, W., Xiao, J., Li, X., Liu, A.-A.: Dual-path rare content enhancement network for image and text matching. IEEE Trans. Circuits Syst. Video Technol. 33(10), 6144\u20136158 (2023)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"4250_CR66","doi-asserted-by":"crossref","unstructured":"Fu, Z., Mao, Z., Song, Y., Zhang, Y.: Learning semantic relationship among instances for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15159\u201315168 (2023)","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"4250_CR67","unstructured":"Wang, X., Yang, J., Wang, Q., Bi, Y.: Csa-net: cross-modal semantic awareness network for image-text retrieval. Available at SSRN 5099042"},{"key":"4250_CR68","doi-asserted-by":"crossref","unstructured":"Qu, L., Liu, M., Cao, D., Nie, L., Tian, Q.: Context-aware multi-view summarization network for image-text matching. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1047\u20131055 (2020)","DOI":"10.1145\/3394171.3413961"},{"issue":"7","key":"4250_CR69","doi-asserted-by":"publisher","first-page":"2866","DOI":"10.1109\/TCSVT.2020.3030656","volume":"31","author":"K Wen","year":"2020","unstructured":"Wen, K., Gu, X., Cheng, Q.: Learning dual semantic relations with graph attention for image-text matching. IEEE Trans. Circuits Syst. Video Technol. 31(7), 2866\u20132879 (2020)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"4250_CR70","doi-asserted-by":"crossref","unstructured":"Pan, Z., Wu, F., Zhang, B.: Fine-grained image-text matching by cross-modal hard aligning network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19275\u201319284 (2023)","DOI":"10.1109\/CVPR52729.2023.01847"},{"key":"4250_CR71","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Ji, Z., Wang, D., Pang, Y., Li, X.: User: unified semantic enhancement with momentum contrast for image-text retrieval. IEEE Trans. Image Process. 33, 595\u2013609 (2024)","DOI":"10.1109\/TIP.2023.3348297"},{"key":"4250_CR72","doi-asserted-by":"publisher","unstructured":"Wei, W., Gui, Z., Wu, C., Zhao, A., Peng, D., Wu, H.: Dynamic visual semantic sub-embeddings and fast re-ranking for image-text retrieval. IEEE Trans. Multimed. (2025). https:\/\/doi.org\/10.1109\/TMM.2025.3535373","DOI":"10.1109\/TMM.2025.3535373"},{"key":"4250_CR73","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102084","volume":"103","author":"S Huang","year":"2024","unstructured":"Huang, S., Fu, W., Zhang, Z., Liu, S.: Global-local fusion based on adversarial sample generation for image-text matching. Inf. Fusion 103, 102084 (2024)","journal-title":"Inf. Fusion"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-04250-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-025-04250-8","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-04250-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T13:03:30Z","timestamp":1772629410000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-025-04250-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,15]]},"references-count":73,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,1]]}},"alternative-id":["4250"],"URL":"https:\/\/doi.org\/10.1007\/s00371-025-04250-8","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,12,15]]},"assertion":[{"value":"18 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 October 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 December 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"70"}}