{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T21:55:42Z","timestamp":1768686942136,"version":"3.49.0"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2024,8,29]],"date-time":"2024-08-29T00:00:00Z","timestamp":1724889600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2024,8,29]],"date-time":"2024-08-29T00:00:00Z","timestamp":1724889600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61872267"],"award-info":[{"award-number":["61872267"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61772359"],"award-info":[{"award-number":["61772359"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61472348"],"award-info":[{"award-number":["61472348"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61902277"],"award-info":[{"award-number":["61902277"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61672455"],"award-info":[{"award-number":["61672455"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Significant Science and Technology Project of Ningbo","award":["20211ZDYF020218"],"award-info":[{"award-number":["20211ZDYF020218"]}]},{"DOI":"10.13039\/501100012165","name":"Key Technologies Research and Development Program","doi-asserted-by":"publisher","award":["2020YFB1709201"],"award-info":[{"award-number":["2020YFB1709201"]}],"id":[{"id":"10.13039\/501100012165","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Image Video Proc."],"DOI":"10.1186\/s13640-024-00639-y","type":"journal-article","created":{"date-parts":[[2024,8,29]],"date-time":"2024-08-29T11:03:38Z","timestamp":1724929418000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["A method for image\u2013text matching based on semantic filtering and adaptive adjustment"],"prefix":"10.1186","volume":"2024","author":[{"given":"Ran","family":"Jin","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4900-3718","authenticated-orcid":false,"given":"Tengda","family":"Hou","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Jin","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Chenjie","family":"Du","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,8,29]]},"reference":[{"key":"639_CR1","doi-asserted-by":"publisher","unstructured":"J. Chen, H. Hu, H. Wu, Y. Jiang, C. Wang, Learning the best pooling strategy for visual semantic embedding, Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 15789\u201315798, https:\/\/doi.org\/10.1109\/CVPR46437.2021.01553.","DOI":"10.1109\/CVPR46437.2021.01553"},{"key":"639_CR2","doi-asserted-by":"publisher","unstructured":"T. Xu, X. Liu, Z. Huang, D. Guo, R. Hong, M. Wang, Early-learning regularized contrastive learning for cross-modal retrieval with noisy labels, Proceedings of the 30th ACM International Conference on Multimedia, 629\u2013637, https:\/\/doi.org\/10.1145\/3503161.3548066.","DOI":"10.1145\/3503161.3548066"},{"issue":"11","key":"639_CR3","doi-asserted-by":"publisher","first-page":"7014","DOI":"10.1109\/TSMC.2021.3130939","volume":"52","author":"D Zhang","year":"2022","unstructured":"D. Zhang, X. Wu, T. Xu, J. Kittler, Two-stage supervised discrete hashing for cross-modal retrieval. IEEE Trans. Syst. Man,Cybern 52(11), 7014\u20137026 (2022). https:\/\/doi.org\/10.1109\/TSMC.2021.3130939","journal-title":"IEEE Trans. Syst. Man,Cybern"},{"key":"639_CR4","doi-asserted-by":"publisher","unstructured":"P. Anderson, X. He, C. Buehler, D. Teney, M. Johnson, S. Gould, L. Zhang, Bottom-up and top-down attention for image captioning and visual question answering, Proceedings of the IEEE conference on computer vision and pattern recognition, 6077\u20136086, https:\/\/doi.org\/10.1109\/CVPR.2018.00636.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"639_CR5","doi-asserted-by":"publisher","unstructured":"K. Lee, X. Chen, G. Hua, H. Hu, X. He, Stacked cross attention for image-text matching, Proceedings of the European conference on computer vision (ECCV), 201\u2013216, https:\/\/doi.org\/10.1007\/978-3-030-01225-0_13.","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"639_CR6","doi-asserted-by":"publisher","unstructured":"Z. Wang, X. Liu, H. Li, L. Sheng, J. Yan, X. Wang, J. Shao, Camp: cross-modal adaptive message passing for text-image retrieval, Proceedings of the IEEE\/CVF international conference on computer vision, 5764\u20135773, https:\/\/doi.org\/10.1109\/ICCV.2019.00586.","DOI":"10.1109\/ICCV.2019.00586"},{"key":"639_CR7","doi-asserted-by":"publisher","unstructured":"X. Wei, T. Zhang, Y. Li, Y. Zhang, F. Wu, Multi-modality cross attention network for image and sentence matching, Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 10941\u201310950, https:\/\/doi.org\/10.1109\/CVPR42600.2020.01095.","DOI":"10.1109\/CVPR42600.2020.01095"},{"key":"639_CR8","doi-asserted-by":"publisher","unstructured":"A. Karpathy, L. Fei-Fei, Deep visual-semantic alignments for generating image descriptions, Proceedings of the IEEE conference on computer vision and pattern recognition, 3128\u20133137, https:\/\/doi.org\/10.1109\/CVPR.2015.7298932.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"639_CR9","doi-asserted-by":"publisher","unstructured":"H. Chen, G. Ding, X. Liu, Z. Lin, J. Liu, J. Han, Imram: iterative matching with recurrent attention memory for cross-modal image-text retrieval, Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 12655\u201312663, https:\/\/doi.org\/10.1109\/CVPR42600.2020.01267.","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"639_CR10","doi-asserted-by":"publisher","unstructured":"C. Liu, Z. Mao, A. Liu, T. Zhang, B. Wang, Y. Zhang, Focus your attention: a bidirectional focal attention network for image-text matching, Proceedings of the 27th ACM international conference on multimedia, 3\u201311, https:\/\/doi.org\/10.1145\/3343031.3350869.","DOI":"10.1145\/3343031.3350869"},{"key":"639_CR11","doi-asserted-by":"publisher","unstructured":"H. Diao, Y. Zhang, L. Ma, H. Lu, Similarity reasoning and filtration for image-text matching, Proceedings of the AAAI Conference on Artificial Intelligence, AAAI Press, California,. 1218\u20131226, https:\/\/doi.org\/10.1609\/aaai.v35i2.16209.","DOI":"10.1609\/aaai.v35i2.16209"},{"issue":"4","key":"639_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3572844","volume":"19","author":"S Yang","year":"2023","unstructured":"S. Yang, Q. Li, W. Li, X. Li, R. Jin, B. Lv, R. Wang, A. Liu, Semantic completion and filtration for image-text retrieval. ACM Trans. Multimed. Comput. Commun. Appl. 19(4), 1\u201320 (2023). https:\/\/doi.org\/10.1145\/3572844","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"639_CR13","doi-asserted-by":"publisher","unstructured":"Y. Huang, Q. Wu, C. Song, L. Wang, Learning semantic concepts and order for image and sentence matching, Proceedings of the IEEE conference on computer vision and pattern recognition. 6163\u20136171, https:\/\/doi.org\/10.1109\/CVPR.2018.00645.","DOI":"10.1109\/CVPR.2018.00645"},{"issue":"12","key":"639_CR14","doi-asserted-by":"publisher","first-page":"2639","DOI":"10.1162\/0899766042321814","volume":"16","author":"DR Hardoon","year":"2004","unstructured":"D.R. Hardoon, S. Szedmak, J. Shawe-Taylor, Canonical correlation analysis: an overview with application to learning methods. Neural Comput. 16(12), 2639\u20132664 (2004). https:\/\/doi.org\/10.1162\/0899766042321814","journal-title":"Neural Comput."},{"key":"639_CR15","doi-asserted-by":"publisher","unstructured":"G. Andrew, R. Arora, J. Bilmes, K. Livescu, Deep canonical correlation analysis, International conference on machine learning, PMLR, 1247\u20131255, https:\/\/doi.org\/10.5555\/3042817.3043076.","DOI":"10.5555\/3042817.3043076"},{"key":"639_CR16","doi-asserted-by":"publisher","unstructured":"L. Wang, Y. Li, S. Lazebnik, Learning deep structure-preserving image-text embeddings, Proceedings of the IEEE conference on computer vision and pattern recognition, 5005\u20135013, https:\/\/doi.org\/10.1109\/CVPR.2016.541.","DOI":"10.1109\/CVPR.2016.541"},{"key":"639_CR17","doi-asserted-by":"publisher","unstructured":"R. Kiros, R. Salakhutdinov, R.S. Zemel, Unifying visual-semantic embeddings with multimodal neural language models, arXiv preprint arXiv:1411.2539 (2014), https:\/\/doi.org\/10.1109\/TMM.2022.3141603.","DOI":"10.1109\/TMM.2022.3141603"},{"key":"639_CR18","unstructured":"F. Faghri, D.J. Fleet, J.R. Kiros, S. Fidler, Vse++: improving visual-semantic embeddings with hard negatives, arXiv preprint arXiv:1707.05612 (2017)."},{"key":"639_CR19","doi-asserted-by":"publisher","unstructured":"H. Nam, J. Ha, J. Kim, Dual attention networks for multimodal reasoning and matching, Proceedings of the IEEE conference on computer vision and pattern recognition, 299\u2013307, https:\/\/doi.org\/10.1109\/CVPR.2017.232.","DOI":"10.1109\/CVPR.2017.232"},{"key":"639_CR20","doi-asserted-by":"publisher","unstructured":"T. Wang, X. Xu, Y. Yang, A. Hanjalic, H.T. Shen, J. Song, Matching images and text with multi-modal tensor fusion and re-ranking, Proceedings of the 27th ACM international conference on multimedia, 12\u201320, https:\/\/doi.org\/10.1145\/3343031.3350875.","DOI":"10.1145\/3343031.3350875"},{"issue":"7","key":"639_CR21","doi-asserted-by":"publisher","first-page":"2250","DOI":"10.1109\/TCSVT.2019.2916167","volume":"30","author":"L Ma","year":"2019","unstructured":"L. Ma, W. Jiang, Z. Jie, Y. Jiang, W. Liu, Matching image and sentence with multi-faceted representations. Ieee Trans. Circuits Syst. Video Technol. 30(7), 2250\u20132261 (2019). https:\/\/doi.org\/10.1109\/TCSVT.2019.2916167","journal-title":"Ieee Trans. Circuits Syst. Video Technol."},{"issue":"11","key":"639_CR22","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"I. Goodfellow, J. Pouget-Abadie, M. Mirza, B. Xu, D. Warde-Farley, S. Ozair, A. Courville, Y. Bengio, Generative adversarial networks. Commun. Acm 63(11), 139\u2013144 (2020). https:\/\/doi.org\/10.1145\/3422622","journal-title":"Commun. Acm"},{"key":"639_CR23","doi-asserted-by":"publisher","unstructured":"B. Wang, Y. Yang, X. Xu, A. Hanjalic, H.T. Shen, Adversarial cross-modal retrieval, Proceedings of the 25th ACM international conference on Multimedia, 154\u2013162, https:\/\/doi.org\/10.1145\/3123266.3123326.","DOI":"10.1145\/3123266.3123326"},{"issue":"1","key":"639_CR24","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3284750","volume":"15","author":"Y Peng","year":"2019","unstructured":"Y. Peng, J. Qi, Cm-gans: cross-modal generative adversarial networks for common representation learning, ACM Transactions on Multimedia Computing, Communications, and Applications. ACM Trans. Multimedia Comput. Commun. Appl 15(1), 1\u201324 (2019). https:\/\/doi.org\/10.1145\/3284750","journal-title":"ACM Trans. Multimedia Comput. Commun. Appl"},{"key":"639_CR25","doi-asserted-by":"publisher","unstructured":"J. Gu, J. Cai, S.R. Joty, L. Niu, G. Wang, Look, imagine and match: improving textual-visual cross-modal retrieval with generative models, Proceedings of the IEEE conference on computer vision and pattern recognition, 7181\u20137189, https:\/\/doi.org\/10.1109\/CVPR.2018.00750.","DOI":"10.1109\/CVPR.2018.00750"},{"issue":"12","key":"639_CR26","doi-asserted-by":"publisher","first-page":"3034","DOI":"10.1109\/TCSVT.2019.2953692","volume":"40","author":"F Shen","year":"2018","unstructured":"F. Shen, Y. Xu, L. Liu, Y. Yang, Z. Huang, H.T. Shen, Unsupervised deep hashing with similarity-adaptive and discrete optimization. Ieee Trans. Pattern Anal. Mach. Intell. 40(12), 3034\u20133044 (2018). https:\/\/doi.org\/10.1109\/TCSVT.2019.2953692","journal-title":"Ieee Trans. Pattern Anal. Mach. Intell."},{"issue":"4","key":"639_CR27","doi-asserted-by":"publisher","first-page":"1473","DOI":"10.1109\/TCYB.2018.2882908","volume":"50","author":"E Yang","year":"2018","unstructured":"E. Yang, T. Liu, C. Deng, D. Tao, Adversarial examples for hamming space search. Ieee T. Cybern. 50(4), 1473\u20131484 (2018). https:\/\/doi.org\/10.1109\/TCYB.2018.2882908","journal-title":"Ieee T. Cybern."},{"issue":"5","key":"639_CR28","doi-asserted-by":"publisher","first-page":"1059","DOI":"10.1109\/TPAMI.2016.2645565","volume":"40","author":"F Zheng","year":"2016","unstructured":"F. Zheng, Y. Tang, L. Shao, Hetero-manifold regularisation for cross-modal hashing. Ieee Trans. Pattern Anal. Mach. Intell. 40(5), 1059\u20131071 (2016). https:\/\/doi.org\/10.1109\/TPAMI.2016.2645565","journal-title":"Ieee Trans. Pattern Anal. Mach. Intell."},{"issue":"8","key":"639_CR29","doi-asserted-by":"publisher","first-page":"3893","DOI":"10.1109\/TIP.2018.2821921","volume":"27","author":"C Deng","year":"2018","unstructured":"C. Deng, Z. Chen, X. Liu, X. Gao, D. Tao, Triplet-based deep hashing network for cross-modal retrieval. Ieee Trans. Image Process. 27(8), 3893\u20133903 (2018). https:\/\/doi.org\/10.1109\/TIP.2018.2821921","journal-title":"Ieee Trans. Image Process."},{"issue":"11","key":"639_CR30","doi-asserted-by":"publisher","first-page":"5292","DOI":"10.1109\/TNNLS.2018.2793863","volume":"29","author":"E Yang","year":"2018","unstructured":"E. Yang, C. Deng, C. Li, W. Liu, J. Li, D. Tao, Shared predictive cross-modal deep quantization. Ieee Trans. Neural Netw. Learn. Syst. 29(11), 5292\u20135303 (2018). https:\/\/doi.org\/10.1109\/TNNLS.2018.2793863","journal-title":"Ieee Trans. Neural Netw. Learn. Syst."},{"key":"639_CR31","doi-asserted-by":"publisher","unstructured":"Z. Hu, Y. Luo, J. Lin, Y. Yan, J. Chen, Multi-level visual-semantic alignments with relation-wise dual attention network for image and text matching. IJCAI. 789\u2013795, https:\/\/doi.org\/10.24963\/ijcai.2019\/111.","DOI":"10.24963\/ijcai.2019\/111"},{"key":"639_CR32","doi-asserted-by":"publisher","unstructured":"Y. Wang, H. Yang, X. Qian, L. Ma, J. Lu, B. Li, X. Fan, Position focused attention network for image-text matching, arXiv preprint arXiv:1907.09748 (2019), https:\/\/doi.org\/10.24963\/ijcai.2019\/526.","DOI":"10.24963\/ijcai.2019\/526"},{"key":"639_CR33","doi-asserted-by":"publisher","unstructured":"J. Wehrmann, D.M. Souza, M.A. Lopes, R.C. Barros, Language-agnostic visual-semantic embeddings, Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5804\u20135813, https:\/\/doi.org\/10.1109\/ICCV.2019.00590.","DOI":"10.1109\/ICCV.2019.00590"},{"key":"639_CR34","doi-asserted-by":"publisher","unstructured":"Q. Zhang, Z. Lei, Z. Zhang, S.Z. Li, Context-aware attention network for image-text retrieval, Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 3536\u20133545, https:\/\/doi.org\/10.1109\/CVPR42600.2020.00359.","DOI":"10.1109\/CVPR42600.2020.00359"},{"issue":"6","key":"639_CR35","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2017","unstructured":"S. Ren, K. He, R. Girshick, J. Sun, Faster r-cnn: towards real-time object detection with region proposal networks. IEEE Trans. Pattern Anal. Mach. Intell. 39(6), 1137\u20131149 (2017). https:\/\/doi.org\/10.1109\/TPAMI.2016.2577031","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"639_CR36","doi-asserted-by":"publisher","first-page":"1320","DOI":"10.1109\/TMM.2022.3141603","volume":"25","author":"K Zhang","year":"2023","unstructured":"K. Zhang, Z. Mao, A. Liu, Unified adaptive relevance distinguishable attention network for image-text matching. Ieee Trans. Multimedia 25, 1320\u20131332 (2023)","journal-title":"Ieee Trans. Multimedia"},{"key":"639_CR37","doi-asserted-by":"publisher","unstructured":"R. Zellers, M. Yatskar, S. Thomson, Y. Choi, Neural motifs: scene graph parsing with global context, Proceedings of the IEEE conference on computer vision and pattern recognition. 5831\u20135840, https:\/\/doi.org\/10.1109\/CVPR.2018.00611.","DOI":"10.1109\/CVPR.2018.00611"},{"key":"639_CR38","doi-asserted-by":"publisher","unstructured":"S. Wang, R. Wang, Z. Yao, S. Shan, X. Chen, Cross-modal scene graph matching for relationship-aware image-text retrieval, Proceedings of the IEEE\/CVF winter conference on applications of computer vision, 1508\u20131517, https:\/\/doi.org\/10.1109\/WACV45572.2020.9093614.","DOI":"10.1109\/WACV45572.2020.9093614"},{"issue":"11","key":"639_CR39","doi-asserted-by":"publisher","first-page":"4368","DOI":"10.1109\/TCSVT.2019.2953692","volume":"30","author":"Y Peng","year":"2019","unstructured":"Y. Peng, J. Chi, Unsupervised cross-media retrieval using domain adaptation with scene graph. Ieee Trans. Circuits Syst. Video Technol. 30(11), 4368\u20134379 (2019). https:\/\/doi.org\/10.1109\/TCSVT.2019.2953692","journal-title":"Ieee Trans. Circuits Syst. Video Technol."},{"key":"639_CR40","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1016\/j.neucom.2018.11.089","volume":"345","author":"L Ma","year":"2019","unstructured":"L. Ma, W. Jiang, Z. Jie, X. Wang, Bidirectional image-sentence retrieval by local and global deep matching. Neurocomputing 345, 36\u201344 (2019). https:\/\/doi.org\/10.1016\/j.neucom.2018.11.089","journal-title":"Neurocomputing"},{"key":"639_CR41","doi-asserted-by":"publisher","first-page":"21847","DOI":"10.1109\/ACCESS.2020.2969808","volume":"8","author":"Z Li","year":"2020","unstructured":"Z. Li, F. Ling, C. Zhang, H. Ma, Combining global and local similarity for cross-media retrieval. Ieee Access 8, 21847\u201321856 (2020). https:\/\/doi.org\/10.1109\/ACCESS.2020.2969808","journal-title":"Ieee Access"},{"issue":"12","key":"639_CR42","doi-asserted-by":"publisher","first-page":"5412","DOI":"10.1109\/TNNLS.2020.2967597","volume":"31","author":"X Xu","year":"2020","unstructured":"X. Xu, T. Wang, Y. Yang, L. Zuo, F. Shen, H.T. Shen, Cross-modal attention with semantic consistence for image\u2013text matching. Ieee Trans. Neural Netw. Learn. Syst. 31(12), 5412\u20135425 (2020). https:\/\/doi.org\/10.1109\/TNNLS.2020.2967597","journal-title":"Ieee Trans. Neural Netw. Learn. Syst."},{"key":"639_CR43","doi-asserted-by":"publisher","unstructured":"M. Luong, H. Pham, C.D. Manning, Effective approaches to attention-based neural machine translation, arXiv preprint arXiv:1508.04025 (2015), https:\/\/doi.org\/10.18653\/v1\/D15-1166.","DOI":"10.18653\/v1\/D15-1166"},{"key":"639_CR44","unstructured":"K. Xu, J. Ba, R. Kiros, K. Cho, A. Courville, R. Salakhudinov, R. Zemel, Y. Bengio, Show, attend and tell: neural image caption generation with visual attention, International conference on machine learning, PMLR. 2048\u20132057."},{"key":"639_CR45","doi-asserted-by":"publisher","unstructured":"B.A. Plummer, L. Wang, C.M. Cervantes, J.C. Caicedo, J. Hockenmaier, S. Lazebnik, Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models, Proceedings of the IEEE international conference on computer vision. 2641\u20132649, https:\/\/doi.org\/10.1109\/ICCV.2015.303.","DOI":"10.1109\/ICCV.2015.303"},{"key":"639_CR46","first-page":"740","volume-title":"Microsoft coco common objects in context, Computer Vision ICCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13","author":"T Lin","year":"2014","unstructured":"T. Lin, M. Maire, S. Belongie, J. Hays, P. Perona, D. Ramanan, P. Dollmr, C.L. Zitnick, Microsoft coco common objects in context, Computer Vision ICCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13 (Springer, Cham, 2014), pp.740\u2013755"},{"key":"639_CR47","unstructured":"A. Radford, J.W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin, J. Clark, Learning transferable visual models from natural language supervision, International conference on machine learning, PMLR. 8748\u20138763."},{"key":"639_CR48","unstructured":"Z. Zeng, W. Mao, A comprehensive empirical study of vision-language pre-trained model for supervised cross-modal retrieval, arXiv preprint arXiv:2201.02772 (2022)."}],"container-title":["EURASIP Journal on Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13640-024-00639-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1186\/s13640-024-00639-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13640-024-00639-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,29]],"date-time":"2024-08-29T11:10:50Z","timestamp":1724929850000},"score":1,"resource":{"primary":{"URL":"https:\/\/jivp-eurasipjournals.springeropen.com\/articles\/10.1186\/s13640-024-00639-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,29]]},"references-count":48,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2024,12]]}},"alternative-id":["639"],"URL":"https:\/\/doi.org\/10.1186\/s13640-024-00639-y","relation":{},"ISSN":["1687-5281"],"issn-type":[{"value":"1687-5281","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8,29]]},"assertion":[{"value":"31 August 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 August 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 August 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing of interests"}}],"article-number":"23"}}