{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T15:29:31Z","timestamp":1769527771919,"version":"3.49.0"},"reference-count":35,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T00:00:00Z","timestamp":1769212800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T00:00:00Z","timestamp":1769212800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62402160"],"award-info":[{"award-number":["62402160"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003787","name":"Natural Science Foundation of Hebei Province","doi-asserted-by":"publisher","award":["F2024202078"],"award-info":[{"award-number":["F2024202078"]}],"id":[{"id":"10.13039\/501100003787","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"DOI":"10.1007\/s11227-025-08208-4","type":"journal-article","created":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T10:39:15Z","timestamp":1769251155000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-scale feature and historical attention-based cross-modal image\u2013text matching model"],"prefix":"10.1007","volume":"82","author":[{"given":"Liqin","family":"Wang","sequence":"first","affiliation":[]},{"given":"Jiayi","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Pengcheng","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Yongfeng","family":"Dong","sequence":"additional","affiliation":[]},{"given":"Xu","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,24]]},"reference":[{"key":"8208_CR1","doi-asserted-by":"crossref","unstructured":"Yang Y, Li Z, Dong Q, Xia H, Sui Z (2024) Can large multimodal models uncover deep semantics behind images? https:\/\/arxiv.org\/abs\/2402.11281","DOI":"10.18653\/v1\/2024.findings-acl.113"},{"key":"8208_CR2","doi-asserted-by":"publisher","unstructured":"Salzmann T, Ryll M, Bewley A, Minderer M (2025) Scene-graph vit: end-to-end open-vocabulary visual relationship detection. In: Leonardis A, Ricci E, Roth S, Russakovsky O, Sattler T, Varol G (eds) Computer Vision\u2014ECCV 2024, Springer, Cham, pp 195\u2013213. https:\/\/doi.org\/10.1007\/978-3-031-72907-2_12","DOI":"10.1007\/978-3-031-72907-2_12"},{"issue":"1","key":"8208_CR3","doi-asserted-by":"publisher","first-page":"138","DOI":"10.1007\/s11227-024-06652-2","volume":"81","author":"L Wang","year":"2025","unstructured":"Wang L, Yang P, Wang X, Xu Z, Dong Y (2025) Scene graph fusion and negative sample generation strategy for image-text matching. J Supercomput 81(1):138. https:\/\/doi.org\/10.1007\/s11227-024-06652-2","journal-title":"J Supercomput"},{"key":"8208_CR4","doi-asserted-by":"publisher","first-page":"207","DOI":"10.1162\/tacl_a_00177","volume":"2","author":"R Socher","year":"2014","unstructured":"Socher R, Karpathy A, Le QV, Manning CD, Ng AY (2014) Grounded compositional semantics for finding and describing images with sentences. Trans Assoc Comput Linguis 2:207\u2013218. https:\/\/doi.org\/10.1162\/tacl_a_00177","journal-title":"Trans Assoc Comput Linguis"},{"key":"8208_CR5","doi-asserted-by":"crossref","unstructured":"Huang Y, Wu Q, Wang L (2018) Learning semantic concepts and order for image and sentence matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 6163\u20136171. https:\/\/arxiv.org\/abs\/1712.02036","DOI":"10.1109\/CVPR.2018.00645"},{"key":"8208_CR6","doi-asserted-by":"publisher","first-page":"617","DOI":"10.1109\/TIP.2020.3038354","volume":"30","author":"Y Zhang","year":"2021","unstructured":"Zhang Y, Zhou W, Wang M, Tian Q, Li H (2021) Deep relation embedding for cross-modal retrieval. IEEE Trans Image Process 30:617\u2013627. https:\/\/doi.org\/10.1109\/TIP.2020.3038354","journal-title":"IEEE Trans Image Process"},{"key":"8208_CR7","doi-asserted-by":"crossref","unstructured":"Cho K, Van\u00a0Merri\u00ebnboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation. arXiv:1406.1078 [cs.CL]","DOI":"10.3115\/v1\/D14-1179"},{"key":"8208_CR8","doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 770\u2013778. https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"8208_CR9","doi-asserted-by":"publisher","unstructured":"Qu L, Liu M, Cao D, Nie L, Tian Q (2020) Context-aware multi-view summarization network for image-text matching. In: Proceedings of the 28th ACM International Conference on Multimedia. MM \u201920, Association for Computing Machinery, New York, NY, USA, pp 1047\u20131055. https:\/\/doi.org\/10.1145\/3394171.3413961","DOI":"10.1145\/3394171.3413961"},{"key":"8208_CR10","doi-asserted-by":"publisher","unstructured":"Yan S, Yu L, Xie Y (2021) Discrete-continuous action space policy gradient-based attention for image-text matching. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 8092\u20138101. https:\/\/doi.org\/10.1109\/CVPR46437.2021.00800","DOI":"10.1109\/CVPR46437.2021.00800"},{"issue":"5","key":"8208_CR11","doi-asserted-by":"publisher","first-page":"3261","DOI":"10.1109\/TCYB.2020.3009004","volume":"52","author":"X Xu","year":"2022","unstructured":"Xu X, Lin K, Gao L, Lu H, Shen HT, Li X (2022) Learning cross-modal common representations by private shared subspaces separation. IEEE Trans Cybern 52(5):3261\u20133275. https:\/\/doi.org\/10.1109\/TCYB.2020.3009004","journal-title":"IEEE Trans Cybern"},{"key":"8208_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102084","volume":"103","author":"S Huang","year":"2024","unstructured":"Huang S, Fu W, Zhang Z, Liu S (2024) Global-local fusion based on adversarial sample generation for image-text matching. Inf Fusion 103:102084. https:\/\/doi.org\/10.1016\/j.inffus.2023.102084","journal-title":"Inf Fusion"},{"key":"8208_CR13","doi-asserted-by":"publisher","unstructured":"Nam H, Ha J-W, Kim J (2017) Dual attention networks for multimodal reasoning and matching. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 2156\u20132164. https:\/\/doi.org\/10.1109\/CVPR.2017.232","DOI":"10.1109\/CVPR.2017.232"},{"key":"8208_CR14","doi-asserted-by":"publisher","unstructured":"Wei X, Zhang T, Li Y, Zhang Y, Wu F (2020) Multi-modality cross attention network for image and sentence matching. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 10938\u201310947. https:\/\/doi.org\/10.1109\/CVPR42600.2020.01095","DOI":"10.1109\/CVPR42600.2020.01095"},{"key":"8208_CR15","doi-asserted-by":"publisher","unstructured":"Liu Y, Wang H, Meng F, Liu M, Liu H (2021) Attend, correct and focus: a bidirectional correct attention network for image-text matching. In: 2021 IEEE International Conference on Image Processing (ICIP), pp 2673\u20132677. https:\/\/doi.org\/10.1109\/ICIP42928.2021.9506438","DOI":"10.1109\/ICIP42928.2021.9506438"},{"key":"8208_CR16","doi-asserted-by":"publisher","unstructured":"Chen H, Ding G, Liu X, Lin Z, Liu J, Han J (2020) Imram: iterative matching with recurrent attention memory for cross-modal image-text retrieval. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 12652\u201312660. https:\/\/doi.org\/10.1109\/CVPR42600.2020.01267","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"8208_CR17","doi-asserted-by":"publisher","unstructured":"Pan Z, Wu F, Zhang B (2023) Fine-grained image-text matching by cross-modal hard aligning network. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 19275\u201319284. https:\/\/doi.org\/10.1109\/CVPR52729.2023.01847","DOI":"10.1109\/CVPR52729.2023.01847"},{"issue":"4","key":"8208_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3631356","volume":"20","author":"T Yao","year":"2023","unstructured":"Yao T, Li Y, Li Y, Zhu Y, Wang G, Yue J (2023) Cross-modal semantically augmented network for image-text matching. ACM Trans Multimedia Comput Commun Appl 20(4):1. https:\/\/doi.org\/10.1145\/3631356","journal-title":"ACM Trans Multimedia Comput Commun Appl"},{"issue":"5","key":"8208_CR19","doi-asserted-by":"publisher","first-page":"2313","DOI":"10.1109\/TPAMI.2020.3042192","volume":"44","author":"X Yang","year":"2022","unstructured":"Yang X, Zhang H, Cai J (2022) Auto-encoding and distilling scene graphs for image captioning. IEEE Trans Pattern Anal Mach Intell 44(5):2313\u20132327. https:\/\/doi.org\/10.1109\/TPAMI.2020.3042192","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"8208_CR20","doi-asserted-by":"crossref","unstructured":"Juan D-C, Lu C-T, Li Z, Peng F, Timofeev A, Chen Y-T, Gao Y, Duerig T, Tomkins A, Ravi S (2019) Graph-rise: graph-regularized image semantic embedding. arXiv preprint arXiv:1902.10814","DOI":"10.1145\/3336191.3371784"},{"key":"8208_CR21","doi-asserted-by":"publisher","unstructured":"Wang S, Wang R, Yao Z, Shan S, Chen X (2020) Cross-modal scene graph matching for relationship-aware image-text retrieval. In: 2020 IEEE Winter Conference on Applications of Computer Vision (WACV), pp 1497\u20131506. https:\/\/doi.org\/10.1109\/WACV45572.2020.9093614","DOI":"10.1109\/WACV45572.2020.9093614"},{"key":"8208_CR22","doi-asserted-by":"publisher","unstructured":"Liu C, Mao Z, Zhang T, Xie H, Wang B, Zhang Y (2020) Graph structured network for image-text matching. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 10918\u201310927. https:\/\/doi.org\/10.1109\/CVPR42600.2020.01093","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"8208_CR23","doi-asserted-by":"publisher","unstructured":"Diao H, Zhang Y, Ma L, Lu H (2021) Similarity reasoning and filtration for image-text matching. 35:1218\u20131226. https:\/\/doi.org\/10.1609\/aaai.v35i2.16209","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"8208_CR24","doi-asserted-by":"publisher","first-page":"593","DOI":"10.1016\/j.neucom.2022.11.003","volume":"518","author":"X Yang","year":"2023","unstructured":"Yang X, Li C, Zheng D, Wen P, Yin G (2023) Rfe-srn: image-text similarity reasoning network based on regional feature enhancement. Neurocomputing 518:593\u2013601. https:\/\/doi.org\/10.1016\/j.neucom.2022.11.003","journal-title":"Neurocomputing"},{"key":"8208_CR25","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128082","volume":"599","author":"W Wang","year":"2024","unstructured":"Wang W, Di X, Liu M, Gao F (2024) Multi-level symmetric semantic alignment network for image text matching. Neurocomputing 599:128082. https:\/\/doi.org\/10.1016\/j.neucom.2024.128082","journal-title":"Neurocomputing"},{"key":"8208_CR26","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.110273","volume":"149","author":"R Yang","year":"2024","unstructured":"Yang R, Wang S, Gu Y, Wang J, Sun Y, Zhang H, Liao Y, Jiao L (2024) Continual learning for cross-modal image-text retrieval based on domain-selective attention. Pattern Recogn 149:110273. https:\/\/doi.org\/10.1016\/j.patcog.2024.110273","journal-title":"Pattern Recogn"},{"key":"8208_CR27","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111550","volume":"291","author":"Y Ding","year":"2024","unstructured":"Ding Y, Yu J, Lv Q, Zhao H, Dong J, Li Y (2024) Multiview adaptive attention pooling for image text retrieval. Knowl Based Syst 291:111550. https:\/\/doi.org\/10.1016\/j.knosys.2024.111550","journal-title":"Knowl Based Syst"},{"key":"8208_CR28","doi-asserted-by":"publisher","unstructured":"Plummer BA, Wang L, Cervantes CM, Caicedo JC, Hockenmaier J, Lazebnik S (2015) Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: 2015 IEEE International Conference on Computer Vision (ICCV), pp 2641\u20132649. https:\/\/doi.org\/10.1109\/ICCV.2015.303","DOI":"10.1109\/ICCV.2015.303"},{"key":"8208_CR29","doi-asserted-by":"crossref","unstructured":"Lin T-Y, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnick CL (2014) Microsoft coco: common objects in context. In: Fleet D, Pajdla T, Schiele B, Tuytelaars T (eds) Computer Vision\u2014ECCV 2014, Springer, Cham, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"8208_CR30","unstructured":"Faghri F, Fleet DJ, Kiros JR, Fidler S (2018) Vse++: improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612"},{"key":"8208_CR31","doi-asserted-by":"publisher","unstructured":"Wang Z, Liu X, Li H, Sheng L, Yan J, Wang X, Shao J (2019) Camp: cross-modal adaptive message passing for text-image retrieval. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp 5763\u20135772. https:\/\/doi.org\/10.1109\/ICCV.2019.00586","DOI":"10.1109\/ICCV.2019.00586"},{"key":"8208_CR32","doi-asserted-by":"publisher","unstructured":"Zhang Q, Lei Z, Zhang Z, Li SZ (2020) Context-aware attention network for image-text retrieval. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 3533\u20133542. https:\/\/doi.org\/10.1109\/CVPR42600.2020.00359","DOI":"10.1109\/CVPR42600.2020.00359"},{"issue":"3","key":"8208_CR33","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1109\/MIS.2023.3265176","volume":"38","author":"H Shang","year":"2023","unstructured":"Shang H, Zhao G, Shi J, Qian X (2023) A multiview text imagination network based on latent alignment for image-text matching. IEEE Intell Syst 38(3):41\u201350. https:\/\/doi.org\/10.1109\/MIS.2023.3265176","journal-title":"IEEE Intell Syst"},{"issue":"5","key":"8208_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3563390","volume":"22","author":"J Pei","year":"2023","unstructured":"Pei J, Zhong K, Yu Z, Wang L, Lakshmanna K (2023) Scene graph semantic inference for image and text matching. ACM Trans Asian Low Resour Lang Inf Process. 22(5):1. https:\/\/doi.org\/10.1145\/3563390","journal-title":"ACM Trans Asian Low Resour Lang Inf Process."},{"issue":"4","key":"8208_CR35","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3572844","volume":"19","author":"S Yang","year":"2023","unstructured":"Yang S, Li Q, Li W, Li X-Y, Jin R, Lv B, Wang R, Liu A (2023) Semantic completion and filtration for image text retrieval. ACM Trans Multimedia Comput Commun Appl 19(4):1. https:\/\/doi.org\/10.1145\/3572844","journal-title":"ACM Trans Multimedia Comput Commun Appl"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-08208-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-025-08208-4","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-08208-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T10:39:16Z","timestamp":1769251156000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-025-08208-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,24]]},"references-count":35,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2026,1]]}},"alternative-id":["8208"],"URL":"https:\/\/doi.org\/10.1007\/s11227-025-08208-4","relation":{},"ISSN":["1573-0484"],"issn-type":[{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,24]]},"assertion":[{"value":"16 July 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 December 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 January 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"95"}}