{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T00:11:24Z","timestamp":1767139884459,"version":"build-2238731810"},"reference-count":61,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2024,2,15]],"date-time":"2024-02-15T00:00:00Z","timestamp":1707955200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,2,15]],"date-time":"2024-02-15T00:00:00Z","timestamp":1707955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100017549","name":"Science and Technology Innovation 2025 Major Project of Ningbo","doi-asserted-by":"publisher","award":["2019B10128"],"award-info":[{"award-number":["2019B10128"]}],"id":[{"id":"10.13039\/501100017549","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004731","name":"Natural Science Foundation of Zhejiang Province","doi-asserted-by":"publisher","award":["LY23F020014"],"award-info":[{"award-number":["LY23F020014"]}],"id":[{"id":"10.13039\/501100004731","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1007\/s00371-024-03274-w","type":"journal-article","created":{"date-parts":[[2024,2,15]],"date-time":"2024-02-15T15:02:21Z","timestamp":1708009341000},"page":"8825-8838","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["ITContrast: contrastive learning with hard negative synthesis for image-text matching"],"prefix":"10.1007","volume":"40","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9618-8965","authenticated-orcid":false,"given":"Fangyu","family":"Wu","sequence":"first","affiliation":[]},{"given":"Qiufeng","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zhao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Siyue","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Yushi","family":"Li","sequence":"additional","affiliation":[]},{"given":"Bailing","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Eng Gee","family":"Lim","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,2,15]]},"reference":[{"key":"3274_CR1","unstructured":"Ghosh, M., Roy, S.S., Mukherjee, H., Obaidullah, S.M., Santosh, K., Roy, K.: Understanding movie poster: transfer-deep learning approach for graphic-rich text recognition. The Visual Computer, 1\u201320 (2022)"},{"key":"3274_CR2","doi-asserted-by":"publisher","first-page":"337","DOI":"10.1007\/s00371-016-1335-8","volume":"34","author":"DV Macedo","year":"2018","unstructured":"Macedo, D.V., Rodrigues, M.A.F.: Real-time dynamic reflections for realistic rendering of 3d scenes. Vis. Comput. 34, 337\u2013346 (2018)","journal-title":"Vis. Comput."},{"key":"3274_CR3","doi-asserted-by":"crossref","unstructured":"Junkert, F., Eberts, M., Ulges, A., Schwanecke, U.: Cross-modal image-graphics retrieval by neural transfer learning. In: Proceedings of the 2017 ACM on International Conference on Multimedia Retrieval, pp. 330\u2013337 (2017)","DOI":"10.1145\/3078971.3078994"},{"key":"3274_CR4","doi-asserted-by":"crossref","unstructured":"Wei, X., Zhang, T., Li, Y., Zhang, Y., Wu, F.: Multi-modality cross attention network for image and sentence matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10941\u201310950 (2020)","DOI":"10.1109\/CVPR42600.2020.01095"},{"key":"3274_CR5","doi-asserted-by":"crossref","unstructured":"Liu, C., Mao, Z., Zhang, T., Xie, H., Wang, B., Zhang, Y.: Graph structured network for image-text matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10921\u201310930 (2020)","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"3274_CR6","doi-asserted-by":"crossref","unstructured":"Diao, H., Zhang, Y., Ma, L., Lu, H.: Similarity reasoning and filtration for image-text matching. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp. 1218\u20131226 (2021)","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"3274_CR7","doi-asserted-by":"publisher","first-page":"1655","DOI":"10.1007\/s00371-018-1565-z","volume":"35","author":"T Jiang","year":"2019","unstructured":"Jiang, T., Zhang, Z., Yang, Y.: Modeling coverage with semantic embedding for image caption generation. Vis. Comput. 35, 1655\u20131665 (2019)","journal-title":"Vis. Comput."},{"key":"3274_CR8","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J., Gao, J.: Unified vision-language pre-training for image captioning and vqa. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp. 13041\u201313049 (2020)","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"3274_CR9","doi-asserted-by":"crossref","unstructured":"Sun, B., Wu, Y., Zhao, Y., Hao, Z., Yu, L., He, J.: Cross-language multimodal scene semantic guidance and leap sampling for video captioning. Vis. Comput., 1\u201317 (2022)","DOI":"10.1007\/s00371-021-02309-w"},{"issue":"11","key":"3274_CR10","doi-asserted-by":"publisher","first-page":"5783","DOI":"10.1007\/s00371-022-02695-9","volume":"39","author":"Z Guo","year":"2022","unstructured":"Guo, Z., Han, D.: Multi-modal co-attention relation networks for visual question answering. Vis. Comput. 39(11), 5783\u201395 (2022)","journal-title":"Vis. Comput."},{"issue":"9\u201310","key":"3274_CR11","doi-asserted-by":"publisher","first-page":"3097","DOI":"10.1007\/s00371-022-02524-z","volume":"38","author":"F Yan","year":"2022","unstructured":"Yan, F., Silamu, W., Li, Y., Chai, Y.: Spca-net: a based on spatial position relationship co-attention network for visual question answering. Vis. Comput. 38(9\u201310), 3097\u20133108 (2022)","journal-title":"Vis. Comput."},{"key":"3274_CR12","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Lu, H.: Deep cross-modal projection learning for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 686\u2013701 (2018)","DOI":"10.1007\/978-3-030-01246-5_42"},{"key":"3274_CR13","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: Vse++: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)"},{"key":"3274_CR14","doi-asserted-by":"crossref","unstructured":"Lee, K.-H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 201\u2013216 (2018)","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"3274_CR15","doi-asserted-by":"crossref","unstructured":"Chen, T., Luo, J.: Expressing objects just like words: Recurrent visual embedding for image-text matching. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp. 10583\u201310590 (2020)","DOI":"10.1609\/aaai.v34i07.6631"},{"key":"3274_CR16","doi-asserted-by":"crossref","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., Fu, Y.: Visual semantic reasoning for image-text matching. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 4654\u20134662 (2019)","DOI":"10.1109\/ICCV.2019.00475"},{"key":"3274_CR17","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: Proceedings of the International Conference on Machine Learning (ICML), pp. 1597\u20131607 (2020)"},{"key":"3274_CR18","doi-asserted-by":"crossref","unstructured":"Li, X., Yin, X., Li, C., Zhang, P., et al.: Oscar: Object-semantics aligned pre-training for vision-language tasks. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 121\u2013137 (2020)","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"3274_CR19","doi-asserted-by":"crossref","unstructured":"Feng, Z., Zeng, Z., Guo, C., Li, Z.: Exploiting visual semantic reasoning for video-text retrieval. In: Proceedings of the International Conference on International Joint Conferences on Artificial Intelligence (IJCAI), pp. 1005\u20131011 (2021)","DOI":"10.24963\/ijcai.2020\/140"},{"key":"3274_CR20","doi-asserted-by":"crossref","unstructured":"Wehrmann, J., Kolling, C., Barros, R.C.: Adaptive cross-modal embeddings for image-text alignment. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp. 12313\u201312320 (2020)","DOI":"10.1609\/aaai.v34i07.6915"},{"key":"3274_CR21","doi-asserted-by":"crossref","unstructured":"Liu, C., Mao, Z., Liu, A.A., Zhang, T., Wang, B., Zhang, Y.: Focus your attention: A bidirectional focal attention network for image-text matching. In: Proceedings of the ACM International Conference on Multimedia (ACM MM), pp. 3\u201311 (2019)","DOI":"10.1145\/3343031.3350869"},{"key":"3274_CR22","doi-asserted-by":"crossref","unstructured":"Pan, Z., Wu, F., Zhang, B.: Fine-grained image-text matching by cross-modal hard aligning network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19275\u201319284 (2023)","DOI":"10.1109\/CVPR52729.2023.01847"},{"key":"3274_CR23","doi-asserted-by":"crossref","unstructured":"Chen, C., Wang, D., Song, B., Tan, H.: Inter-intra modal representation augmentation with dct-transformer adversarial network for image-text matching. IEEE Transactions on Multimedia, 1\u201313 (2023)","DOI":"10.1109\/TMM.2023.3243665"},{"issue":"12","key":"3274_CR24","doi-asserted-by":"publisher","first-page":"5412","DOI":"10.1109\/TNNLS.2020.2967597","volume":"31","author":"X Xu","year":"2020","unstructured":"Xu, X., Wang, T., Yang, Y., Zuo, L., Shen, F., Shen, H.T.: Cross-modal attention with semantic consistence for image-text matching. IEEE Trans. Neural Netw. Learn. Syst. 31(12), 5412\u20135425 (2020)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"3274_CR25","doi-asserted-by":"crossref","unstructured":"Zhang, K., Mao, Z., Wang, Q., Zhang, Y.: Negative-aware attention framework for image-text matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15661\u201315670 (2022)","DOI":"10.1109\/CVPR52688.2022.01521"},{"key":"3274_CR26","doi-asserted-by":"crossref","unstructured":"Wang, S., Wang, R., Yao, Z., Shan, S., Chen, X.: Cross-modal scene graph matching for relationship-aware image-text retrieval. In: Proceedings of the IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 1508\u20131517 (2020)","DOI":"10.1109\/WACV45572.2020.9093614"},{"key":"3274_CR27","doi-asserted-by":"crossref","unstructured":"Wang, H., Zhang, Y., Ji, Z., Pang, Y., Ma, L.: Consensus-aware visual-semantic embedding for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 18\u201334 (2020)","DOI":"10.1007\/978-3-030-58586-0_2"},{"key":"3274_CR28","doi-asserted-by":"crossref","unstructured":"Zhang, H., Mao, Z., Zhang, K., Zhang, Y.: Show your faith: Cross-modal confidence-aware network for image-text matching. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), vol. 36, pp. 3262\u20133270 (2022)","DOI":"10.1609\/aaai.v36i3.20235"},{"key":"3274_CR29","doi-asserted-by":"crossref","unstructured":"Chen, T., Deng, J., Luo, J.: Adaptive offline quintuplet loss for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 549\u2013565 (2020)","DOI":"10.1007\/978-3-030-58601-0_33"},{"key":"3274_CR30","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"3274_CR31","first-page":"18661","volume":"33","author":"P Khosla","year":"2020","unstructured":"Khosla, P., Teterwak, P., Wang, C., Sarna, A., Tian, Y., Isola, P., Maschinot, A., Liu, C., Krishnan, D.: Supervised contrastive learning. Adv. Neural Inform. Process. Syst. NeurIPS 33, 18661\u201373 (2020)","journal-title":"Adv. Neural Inform. Process. Syst. NeurIPS"},{"key":"3274_CR32","doi-asserted-by":"crossref","unstructured":"Li, W., Gao, C., Niu, G., Xiao, X., Liu, H., Liu, J., Wu, H., Wang, H.: Unimo: Towards unified-modal understanding and generation via cross-modal contrastive learning. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Vol. 1: Long Papers), pp. 2592\u20132607 (2021)","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"3274_CR33","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (ICML), pp. 8748\u20138763 (2021). PMLR"},{"key":"3274_CR34","unstructured":"Kenton, J.D.M.-W.C., Toutanova, L.K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the North American Chapter of the Association for Computational Linguistics (NAACL-HLT), vol. 1, p. 2 (2019)"},{"key":"3274_CR35","doi-asserted-by":"crossref","unstructured":"Gordo, A., Larlus, D.: Beyond instance-level image retrieval: Leveraging captions to learn a global visual representation for semantic retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6589\u20136598 (2017)","DOI":"10.1109\/CVPR.2017.560"},{"key":"3274_CR36","unstructured":"Kim, W., Son, B., Kim, I.: Vilt: Vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning (ICML)), pp. 5583\u20135594 (2021). PMLR"},{"key":"3274_CR37","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: Vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3274_CR38","doi-asserted-by":"crossref","unstructured":"Zhang, P., Li, X., Hu, X., Yang, J., Zhang, L., Wang, L., Choi, Y., Gao, J.: Vinvl: Revisiting visual representations in vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5579\u20135588 (2021)","DOI":"10.1109\/CVPR46437.2021.00553"},{"issue":"7","key":"3274_CR39","doi-asserted-by":"publisher","first-page":"2933","DOI":"10.1007\/s00371-022-02501-6","volume":"39","author":"R Qiu","year":"2023","unstructured":"Qiu, R., Cai, Z., Chang, Z., Liu, S., Tu, G.: A two-stage image process for water level recognition via dual-attention cornernet and ctransformer. Vis. Comput. 39(7), 2933\u20132952 (2023)","journal-title":"Vis. Comput."},{"key":"3274_CR40","doi-asserted-by":"crossref","unstructured":"Agrawal, H., Desai, K., Wang, Y., Chen, X., Jain, R., Johnson, M., Batra, D., Parikh, D., Lee, S., Anderson, P.: Nocaps: Novel object captioning at scale. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 8948\u20138957 (2019)","DOI":"10.1109\/ICCV.2019.00904"},{"key":"3274_CR41","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 740\u2013755 (2014). Springer","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"3274_CR42","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D.: Vqa: Visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"3274_CR43","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: Gqa: A new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"3274_CR44","doi-asserted-by":"crossref","unstructured":"Zellers, R., Bisk, Y., Farhadi, A., Choi, Y.: From recognition to cognition: Visual commonsense reasoning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6720\u20136731 (2019)","DOI":"10.1109\/CVPR.2019.00688"},{"key":"3274_CR45","doi-asserted-by":"crossref","unstructured":"Desai, K., Johnson, J.: Virtex: Learning visual representations from textual annotations. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 11162\u201311173 (2021)","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"3274_CR46","doi-asserted-by":"crossref","unstructured":"Sariyildiz, M.B., Perez, J., Larlus, D.: Learning visual representations with caption annotations. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 153\u2013170 (2020). Springer","DOI":"10.1007\/978-3-030-58598-3_10"},{"issue":"2","key":"3274_CR47","doi-asserted-by":"publisher","first-page":"394","DOI":"10.1109\/TPAMI.2018.2797921","volume":"41","author":"L Wang","year":"2018","unstructured":"Wang, L., Li, Y., Huang, J., Lazebnik, S.: Learning two-branch neural networks for image-text matching tasks. IEEE Trans. Pattern Anal. Mach. Intell. 41(2), 394\u2013407 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3274_CR48","unstructured":"Zhang, Y., Jiang, H., Miura, Y., Manning, C.D., Langlotz, C.P.: Contrastive learning of medical visual representations from paired images and text. In: Machine Learning for Healthcare Conference, pp. 2\u201325 (2022). PMLR"},{"key":"3274_CR49","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.T., Parekh, Z., Pham, H., Le, Q., Sung, Y.H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916 (2021). PMLR"},{"key":"3274_CR50","doi-asserted-by":"crossref","unstructured":"Liu, S., Fan, H., Qian, S., Chen, Y., Ding, W., Wang, Z.: Hit: Hierarchical transformer with momentum contrast for video-text retrieval. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 11915\u201311925 (2021)","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"3274_CR51","first-page":"91","volume":"28","author":"S Ren","year":"2015","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. Adv. Neural Inform. Process. Syst. NeurIPS 28, 91\u201399 (2015)","journal-title":"Adv. Neural Inform. Process. Syst. NeurIPS"},{"issue":"11","key":"3274_CR52","first-page":"2579","volume":"9","author":"L Maaten","year":"2008","unstructured":"Maaten, L., Hinton, G.: Visualizing data using t-sne. J. Mach. Learn. Res. 9(11), 2579\u20132605 (2008)","journal-title":"J. Mach. Learn. Res."},{"key":"3274_CR53","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Lei, Z., Zhang, Z., Li, S.Z.: Context-aware attention network for image-text retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3536\u20133545 (2020)","DOI":"10.1109\/CVPR42600.2020.00359"},{"key":"3274_CR54","doi-asserted-by":"crossref","unstructured":"Chen, H., Ding, G., Liu, X., Lin, Z., Liu, J., Han, J.: Imram: Iterative matching with recurrent attention memory for cross-modal image-text retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12655\u201312663 (2020)","DOI":"10.1109\/CVPR42600.2020.01267"},{"issue":"10","key":"3274_CR55","doi-asserted-by":"publisher","first-page":"6534","DOI":"10.1109\/TPAMI.2021.3088863","volume":"44","author":"J Wei","year":"2021","unstructured":"Wei, J., Yang, Y., Xu, X., Zhu, X., Shen, H.T.: Universal weighting metric learning for cross-modal retrieval. IEEE Trans. Pattern Anal. Mach. Intell. 44(10), 6534\u201345 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3274_CR56","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., Jiang, D.: Unicoder-vl: A universal encoder for vision and language by cross-modal pre-training. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp. 11336\u201311344 (2020)","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"3274_CR57","doi-asserted-by":"crossref","unstructured":"Chen, Y.C., Li, L., Yu, L., El\u00a0Kholy, A., Ahmed, Z., et al.: Uniter: Universal image-text representation learning. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 104\u2013120 (2020)","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"3274_CR58","doi-asserted-by":"crossref","unstructured":"Chen, J., Hu, H., Wu, H., Jiang, Y., Wang, C.: Learning the best pooling strategy for visual semantic embedding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15789\u201315798 (2021)","DOI":"10.1109\/CVPR46437.2021.01553"},{"key":"3274_CR59","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Laptev, I., Sivic, J., Zisserman, A.: Thinking fast and slow: Efficient text-to-visual retrieval with transformers. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9826\u20139836 (2021)","DOI":"10.1109\/CVPR46437.2021.00970"},{"key":"3274_CR60","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"3274_CR61","doi-asserted-by":"crossref","unstructured":"Mahajan, D., Girshick, R., Ramanathan, V., He, K., Paluri, M., Li, Y., Bharambe, A., Van Der\u00a0Maaten, L.: Exploring the limits of weakly supervised pretraining. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 181\u2013196 (2018)","DOI":"10.1007\/978-3-030-01216-8_12"}],"updated-by":[{"DOI":"10.1007\/s00371-024-03348-9","type":"correction","label":"Correction","source":"publisher","updated":{"date-parts":[[2024,3,11]],"date-time":"2024-03-11T00:00:00Z","timestamp":1710115200000}}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03274-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-024-03274-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03274-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,12]],"date-time":"2024-11-12T04:13:35Z","timestamp":1731384815000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-024-03274-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,15]]},"references-count":61,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2024,12]]}},"alternative-id":["3274"],"URL":"https:\/\/doi.org\/10.1007\/s00371-024-03274-w","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,2,15]]},"assertion":[{"value":"8 January 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 February 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 March 2024","order":3,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Correction","order":4,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"A Correction to this paper has been published:","order":5,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"https:\/\/doi.org\/10.1007\/s00371-024-03348-9","URL":"https:\/\/doi.org\/10.1007\/s00371-024-03348-9","order":6,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"We declare that we do not have any commercial or associative interest that represents a conflict of interest in connection with the work submitted.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}