{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T02:39:34Z","timestamp":1780972774202,"version":"3.54.1"},"reference-count":65,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2025,6,16]],"date-time":"2025-06-16T00:00:00Z","timestamp":1750032000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,6,16]],"date-time":"2025-06-16T00:00:00Z","timestamp":1750032000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61873160"],"award-info":[{"award-number":["61873160"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100007219","name":"Natural Science Foundation of Shanghai Municipality","doi-asserted-by":"publisher","award":["21ZR1426500"],"award-info":[{"award-number":["21ZR1426500"]}],"id":[{"id":"10.13039\/100007219","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1007\/s00371-025-03981-y","type":"journal-article","created":{"date-parts":[[2025,6,16]],"date-time":"2025-06-16T07:54:54Z","timestamp":1750060494000},"page":"9555-9570","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Enhancing image\u2013text matching through multi-level semantic consistency alignment"],"prefix":"10.1007","volume":"41","author":[{"given":"Liqi","family":"Zhu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dezhi","family":"Han","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiang","family":"Shen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chongqing","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kuan-Ching","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,6,16]]},"reference":[{"key":"3981_CR1","doi-asserted-by":"crossref","unstructured":"Li, J., Niu, L., Zhang, L.: Action-aware embedding enhancement for image-text retrieval. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 1323\u20131331 (2022)","DOI":"10.1609\/aaai.v36i2.20020"},{"issue":"6","key":"3981_CR2","doi-asserted-by":"publisher","first-page":"509","DOI":"10.1016\/j.vrih.2023.06.003","volume":"5","author":"M Wang","year":"2023","unstructured":"Wang, M., Meng, M., Liu, J., Wu, J.: Learning adequate alignment and interaction for cross-modal retrieval. Virtual Rea. Intell. Hardware 5(6), 509\u2013522 (2023)","journal-title":"Virtual Rea. Intell. Hardware"},{"issue":"6","key":"3981_CR3","doi-asserted-by":"publisher","first-page":"4503","DOI":"10.1109\/TCSVT.2023.3340225","volume":"34","author":"H Li","year":"2023","unstructured":"Li, H., Li, M., Peng, Q., Wang, S., Yu, H., Wang, Z.: Correlation-guided semantic consistency network for visible-infrared person re-identification. IEEE Trans. Circuits Syst. Video Technol. 34(6), 4503\u20134515 (2023)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"3981_CR4","doi-asserted-by":"crossref","unstructured":"Kuo, C.-W., Kira, Z.: HAAV: hierarchical aggregation of augmented views for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11039\u201311049 (2023)","DOI":"10.1109\/CVPR52729.2023.01062"},{"key":"3981_CR5","doi-asserted-by":"crossref","unstructured":"Zhang, C., Lin, K., Yang, Z., Wang, J., Li, L., Lin, C.-C., Liu, Z., Wang, L.: MM-Narrator: narrating long-form videos with multimodal in-context learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13647\u201313657 (2024)","DOI":"10.1109\/CVPR52733.2024.01295"},{"key":"3981_CR6","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, Y., He, Y., Li, Y., Wang, Y., Liu, Y., Wang, Z., Xu, J., Chen, G., Luo, P.: MVBench: a comprehensive multi-modal video understanding benchmark. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22195\u201322206 (2024)","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"3981_CR7","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108980","volume":"132","author":"C Chen","year":"2022","unstructured":"Chen, C., Han, D., Chang, C.-C.: CAAN: context-aware attention network for visual question answering. Pattern Recogn. 132, 108980 (2022)","journal-title":"Pattern Recogn."},{"issue":"4","key":"3981_CR8","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1016\/j.vrih.2023.06.002","volume":"6","author":"H Zhang","year":"2024","unstructured":"Zhang, H., Wei, Z., Liu, G., Wang, R., Mu, R., Liu, C., Yuan, A., Cao, G., Hu, N.: MKEAH: multimodal knowledge extraction and accumulation based on hyperplane embedding for knowledge-based visual question answering. Virtual Real. Intell. Hardware 6(4), 280\u2013291 (2024)","journal-title":"Virtual Real. Intell. Hardware"},{"key":"3981_CR9","doi-asserted-by":"crossref","unstructured":"Wang, R., Chen, H., Yang, J., Xue, L.: Adaptive sparse triple convolutional attention for enhanced visual question answering. Vis. Comput., 1\u201317 (2025)","DOI":"10.1007\/s00371-025-03812-0"},{"key":"3981_CR10","unstructured":"Li, S., Tang, H.: Multimodal alignment and fusion: a survey. arXiv preprint arXiv:2411.17040 (2024)"},{"key":"3981_CR11","doi-asserted-by":"crossref","unstructured":"Cao, M., Li, S., Li, J., Nie, L., Zhang, M.: Image-text retrieval: a survey on recent research and development. arXiv preprint arXiv:2203.14713 (2022)","DOI":"10.24963\/ijcai.2022\/759"},{"key":"3981_CR12","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: VSE++: improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)"},{"issue":"12","key":"3981_CR13","doi-asserted-by":"publisher","first-page":"5412","DOI":"10.1109\/TNNLS.2020.2967597","volume":"31","author":"X Xu","year":"2020","unstructured":"Xu, X., Wang, T., Yang, Y., Zuo, L., Shen, F., Shen, H.T.: Cross-modal attention with semantic consistence for image-text matching. IEEE Trans. Neural Netw. Learn. Syst. 31(12), 5412\u20135425 (2020)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"3981_CR14","doi-asserted-by":"crossref","unstructured":"Zhang, K., Mao, Z., Wang, Q., Zhang, Y.: Negative-aware attention framework for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15661\u201315670 (2022)","DOI":"10.1109\/CVPR52688.2022.01521"},{"issue":"6","key":"3981_CR15","doi-asserted-by":"publisher","first-page":"4366","DOI":"10.1109\/TPAMI.2024.3355461","volume":"46","author":"S Wang","year":"2024","unstructured":"Wang, S., Chang, J., Wang, Z., Li, H., Ouyang, W., Tian, Q.: Content-aware rectified activation for zero-shot fine-grained image retrieval. IEEE Trans. Pattern Anal. Mach. Intell. 46(6), 4366\u20134380 (2024)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3981_CR16","first-page":"64681","volume":"36","author":"S Wang","year":"2023","unstructured":"Wang, S., Chang, J., Li, H., Wang, Z., Ouyang, W., Tian, Q.: Learning to parameterize visual attributes for open-set fine-grained retrieval. Adv. Neural. Inf. Process. Syst. 36, 64681\u201364694 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3981_CR17","doi-asserted-by":"crossref","unstructured":"Yao, J., Chen, J., Niu, L., Sheng, B.: Scene-aware human pose generation using transformer. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 2847\u20132855 (2023)","DOI":"10.1145\/3581783.3612439"},{"key":"3981_CR18","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.110900","volume":"157","author":"X Pu","year":"2025","unstructured":"Pu, X., Wang, Z., Yuan, L., Wu, Y., Jing, L., Gao, X.: GADNet: improving image-text matching via graph-based aggregation and disentanglement. Pattern Recogn. 157, 110900 (2025)","journal-title":"Pattern Recogn."},{"issue":"3","key":"3981_CR19","doi-asserted-by":"publisher","first-page":"2237","DOI":"10.1002\/cav.2237","volume":"35","author":"X Lin","year":"2024","unstructured":"Lin, X., Zhang, Y., Wang, S., Piao, X., Yin, B.: Multiagent trajectory prediction with global-local scene-enhanced social interaction graph network. Comput. Anim. Virtual Worlds 35(3), 2237 (2024)","journal-title":"Comput. Anim. Virtual Worlds"},{"key":"3981_CR20","doi-asserted-by":"crossref","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., Fu, Y.: Visual semantic reasoning for image-text matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4654\u20134662 (2019)","DOI":"10.1109\/ICCV.2019.00475"},{"key":"3981_CR21","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/TMM.2021.3120873","volume":"25","author":"X Lin","year":"2021","unstructured":"Lin, X., Sun, S., Huang, W., Sheng, B., Li, P., Feng, D.D.: EAPT: efficient attention pyramid transformer for image processing. IEEE Trans. Multimedia 25, 50\u201361 (2021)","journal-title":"IEEE Trans. Multimedia"},{"key":"3981_CR22","doi-asserted-by":"crossref","unstructured":"Chen, H., Ding, G., Liu, X., Lin, Z., Liu, J., Han, J.: IMRAM: iterative matching with recurrent attention memory for cross-modal image-text retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12655\u201312663 (2020)","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"3981_CR23","doi-asserted-by":"crossref","unstructured":"Chen, S., Zhao, Y., Jin, Q., Wu, Q.: Fine-grained video-text retrieval with hierarchical graph reasoning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10638\u201310647 (2020)","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"3981_CR24","doi-asserted-by":"crossref","unstructured":"Hong, S., Yang, D., Choi, J., Lee, H.: Inferring semantic layout for hierarchical text-to-image synthesis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7986\u20137994 (2018)","DOI":"10.1109\/CVPR.2018.00833"},{"key":"3981_CR25","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh, M., Young, P., Hockenmaier, J.: Framing image description as a ranking task: data, models and evaluation metrics. J. Artif. Intell. Res. 47, 853\u2013899 (2013)","journal-title":"J. Artif. Intell. Res."},{"key":"3981_CR26","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S.: Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539 (2014)"},{"key":"3981_CR27","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"3981_CR28","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, Y., Lazebnik, S.: Learning deep structure-preserving image-text embeddings. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5005\u20135013 (2016)","DOI":"10.1109\/CVPR.2016.541"},{"key":"3981_CR29","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3981_CR30","doi-asserted-by":"crossref","unstructured":"Wang, J., Zhang, H., Zhong, Y., Liang, Y., Ji, R., Cang, Y.: Advanced multimodal deep learning architecture for image-text matching. In: 2024 IEEE 4th International Conference on Electronic Technology, Communication and Information (ICETCI), pp. 1185\u20131191 (2024). IEEE","DOI":"10.1109\/ICETCI61221.2024.10594167"},{"issue":"1","key":"3981_CR31","doi-asserted-by":"publisher","first-page":"137","DOI":"10.1007\/s11263-023-01873-z","volume":"132","author":"S Wang","year":"2024","unstructured":"Wang, S., Wang, Z., Li, H., Chang, J., Ouyang, W., Tian, Q.: Accurate fine-grained object recognition with structure-driven relation graph networks. Int. J. Comput. Vis. 132(1), 137\u2013160 (2024)","journal-title":"Int. J. Comput. Vis."},{"issue":"13","key":"3981_CR32","doi-asserted-by":"publisher","first-page":"16706","DOI":"10.1007\/s10489-022-04355-w","volume":"53","author":"X Shen","year":"2023","unstructured":"Shen, X., Han, D., Guo, Z., Chen, C., Hua, J., Luo, G.: Local self-attention in transformer for visual question answering. Appl. Intell. 53(13), 16706\u201316723 (2023)","journal-title":"Appl. Intell."},{"key":"3981_CR33","doi-asserted-by":"crossref","unstructured":"Pan, Z., Wu, F., Zhang, B.: Fine-grained image-text matching by cross-modal hard aligning network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19275\u201319284 (2023)","DOI":"10.1109\/CVPR52729.2023.01847"},{"key":"3981_CR34","first-page":"7290","volume":"35","author":"J Li","year":"2022","unstructured":"Li, J., He, X., Wei, L., Qian, L., Zhu, L., Xie, L., Zhuang, Y., Tian, Q., Tang, S.: Fine-grained semantically aligned vision-language pre-training. Adv. Neural. Inf. Process. Syst. 35, 7290\u20137303 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3981_CR35","doi-asserted-by":"crossref","unstructured":"Wang, S., Chang, J., Li, H., Wang, Z., Ouyang, W., Tian, Q.: Open-set fine-grained retrieval via prompting vision-language evaluator. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19381\u201319391 (2023)","DOI":"10.1109\/CVPR52729.2023.01857"},{"key":"3981_CR36","doi-asserted-by":"publisher","first-page":"88221","DOI":"10.1007\/s11042-023-17956-5","volume":"83","author":"J Chen","year":"2024","unstructured":"Chen, J., Zhang, H.: Semantic enhancement and multi-level alignment network for cross-modal retrieval. Multimedia Tools Appl. 83, 88221\u201388243 (2024)","journal-title":"Multimedia Tools Appl."},{"key":"3981_CR37","doi-asserted-by":"publisher","first-page":"292","DOI":"10.1016\/j.future.2023.01.004","volume":"142","author":"R Yu","year":"2023","unstructured":"Yu, R., Jin, F., Qiao, Z., Yuan, Y., Wang, G.: Multi-scale image-text matching network for scene and spatio-temporal images. Futur. Gener. Comput. Syst. 142, 292\u2013300 (2023)","journal-title":"Futur. Gener. Comput. Syst."},{"issue":"3","key":"3981_CR38","doi-asserted-by":"publisher","first-page":"2266","DOI":"10.1002\/cav.2266","volume":"35","author":"W Lin","year":"2024","unstructured":"Lin, W., Zhang, J., Meng, W., Liu, X., Zhang, X.: Hide: hierarchical iterative decoding enhancement for multi-view 3d human parameter regression. Comput. Anim. Virtual Worlds 35(3), 2266 (2024)","journal-title":"Comput. Anim. Virtual Worlds"},{"key":"3981_CR39","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128082","volume":"599","author":"W Wang","year":"2024","unstructured":"Wang, W., Di, X., Liu, M., Gao, F.: Multi-level symmetric semantic alignment network for image-text matching. Neurocomputing 599, 128082 (2024)","journal-title":"Neurocomputing"},{"key":"3981_CR40","doi-asserted-by":"publisher","first-page":"9189","DOI":"10.1109\/TMM.2023.3248160","volume":"25","author":"J Guo","year":"2023","unstructured":"Guo, J., Wang, M., Zhou, Y., Song, B., Chi, Y., Fan, W., Chang, J.: HGAN: hierarchical graph alignment network for image-text retrieval. IEEE Trans. Multimedia 25, 9189\u20139202 (2023)","journal-title":"IEEE Trans. Multimedia"},{"key":"3981_CR41","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102080","volume":"102","author":"L Diao","year":"2024","unstructured":"Diao, L., Tang, X., Wang, J., Xie, G., Hu, J.: Hierarchical visual-semantic interaction for scene text recognition. Inf Fusion 102, 102080 (2024)","journal-title":"Inf Fusion"},{"key":"3981_CR42","doi-asserted-by":"publisher","first-page":"6821","DOI":"10.1109\/TMM.2022.3214776","volume":"25","author":"J Zhu","year":"2022","unstructured":"Zhu, J., Zhang, Q., Fei, L., Cai, R., Xie, Y., Sheng, B., Yang, X.: FFFN: frame-by-frame feedback fusion network for video super-resolution. IEEE Trans. Multimedia 25, 6821\u20136835 (2022)","journal-title":"IEEE Trans. Multimedia"},{"key":"3981_CR43","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"3981_CR44","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L.-J., Shamma, D.A.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"3981_CR45","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"3981_CR46","doi-asserted-by":"crossref","unstructured":"Chen, J., Hu, H., Wu, H., Jiang, Y., Wang, C.: Learning the best pooling strategy for visual semantic embedding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15789\u201315798 (2021)","DOI":"10.1109\/CVPR46437.2021.01553"},{"key":"3981_CR47","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"3981_CR48","unstructured":"Mikolov, T.: Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.37813781 (2013)"},{"key":"3981_CR49","doi-asserted-by":"crossref","unstructured":"Diao, H., Zhang, Y., Ma, L., Lu, H.: Similarity reasoning and filtration for image-text matching. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, pp. 1218\u20131226 (2021)","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"3981_CR50","unstructured":"Karpathy, A., Joulin, A., Fei-Fei, L.F.: Deep fragment embeddings for bidirectional image sentence mapping. In: Advances in Neural Information Processing Systems 27 (2014)"},{"key":"3981_CR51","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"3981_CR52","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part V 13, pp. 740\u2013755 (2014). Springer","DOI":"10.1007\/978-3-319-10602-1_48"},{"issue":"11","key":"3981_CR53","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster, M., Paliwal, K.K.: Bidirectional recurrent neural networks. IEEE Trans. Signal Process. 45(11), 2673\u20132681 (1997)","journal-title":"IEEE Trans. Signal Process."},{"key":"3981_CR54","doi-asserted-by":"crossref","unstructured":"Lee, K.-H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 201\u2013216 (2018)","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"3981_CR55","doi-asserted-by":"publisher","first-page":"8933","DOI":"10.1109\/TMM.2023.3243665","volume":"25","author":"C Chen","year":"2023","unstructured":"Chen, C., Wang, D., Song, B., Tan, H.: Inter-intra modal representation augmentation with DCT-transformer adversarial network for image-text matching. IEEE Trans. Multimedia 25, 8933\u20138945 (2023)","journal-title":"IEEE Trans. Multimedia"},{"issue":"1","key":"3981_CR56","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2022.103154","volume":"60","author":"X Xie","year":"2023","unstructured":"Xie, X., Li, Z., Tang, Z., Yao, D., Ma, H.: Unifying knowledge iterative dissemination and relational reconstruction network for image-text matching. Inf. Process. Manag. 60(1), 103154 (2023)","journal-title":"Inf. Process. Manag."},{"key":"3981_CR57","doi-asserted-by":"crossref","unstructured":"Fu, Z., Mao, Z., Song, Y., Zhang, Y.: Learning semantic relationship among instances for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15159\u201315168 (2023)","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"3981_CR58","doi-asserted-by":"crossref","unstructured":"Kim, D., Kim, N., Kwak, S.: Improving cross-modal retrieval with set of diverse embeddings. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23422\u201323431 (2023)","DOI":"10.1109\/CVPR52729.2023.02243"},{"key":"3981_CR59","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111503","volume":"289","author":"X Pu","year":"2024","unstructured":"Pu, X., Chen, Y., Yuan, L., Zhang, Y., Li, H., Jing, L., Gao, X.: MiC: image-text matching in circles with cross-modal generative knowledge enhancement. Knowl.-Based Syst. 289, 111503 (2024)","journal-title":"Knowl.-Based Syst."},{"key":"3981_CR60","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102084","volume":"103","author":"S Huang","year":"2024","unstructured":"Huang, S., Fu, W., Zhang, Z., Liu, S.: Global-local fusion based on adversarial sample generation for image-text matching. Inf. Fusion 103, 102084 (2024)","journal-title":"Inf. Fusion"},{"issue":"7","key":"3981_CR61","doi-asserted-by":"publisher","first-page":"6590","DOI":"10.1109\/TCSVT.2024.3369656","volume":"34","author":"Z Li","year":"2024","unstructured":"Li, Z., Zhang, L., Zhang, K., Zhang, Y., Mao, Z.: Improving image-text matching with bidirectional consistency of cross-modal alignment. IEEE Trans. Circuits Syst. Video Technol. 34(7), 6590\u20136607 (2024)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"10","key":"3981_CR62","doi-asserted-by":"publisher","first-page":"9678","DOI":"10.1109\/TCSVT.2024.3392619","volume":"34","author":"G Xiong","year":"2024","unstructured":"Xiong, G., Meng, M., Zhang, T., Zhang, D., Zhang, Y.: Reference-aware adaptive network for image-text matching. IEEE Trans. Circuits Syst. Video Technol. 34(10), 9678\u20139691 (2024)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"10","key":"3981_CR63","doi-asserted-by":"publisher","first-page":"6131","DOI":"10.1109\/TCSVT.2023.3253548","volume":"33","author":"H Zhu","year":"2023","unstructured":"Zhu, H., Zhang, C., Wei, Y., Huang, S., Zhao, Y.: ESA: external space attention aggregation for image-text retrieval. IEEE Trans. Circuits Syst. Video Technol. 33(10), 6131\u20136143 (2023)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"4","key":"3981_CR64","doi-asserted-by":"publisher","first-page":"2973","DOI":"10.1109\/TCSVT.2023.3307554","volume":"34","author":"K Zhang","year":"2023","unstructured":"Zhang, K., Hu, B., Zhang, H., Li, Z., Mao, Z.: Enhanced semantic similarity learning framework for image-text matching. IEEE Trans. Circuits Syst. Video Technol. 34(4), 2973\u20132988 (2023)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"3981_CR65","doi-asserted-by":"crossref","unstructured":"Pham, K., Huynh, C., Lim, S.-N., Shrivastava, A.: Composing object relations and attributes for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14354\u201314363 (2024)","DOI":"10.1109\/CVPR52733.2024.01361"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-03981-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-025-03981-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-03981-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T09:39:01Z","timestamp":1757929141000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-025-03981-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,16]]},"references-count":65,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2025,9]]}},"alternative-id":["3981"],"URL":"https:\/\/doi.org\/10.1007\/s00371-025-03981-y","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,6,16]]},"assertion":[{"value":"29 April 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 June 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no conflict of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}