{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T16:22:59Z","timestamp":1769012579997,"version":"3.49.0"},"reference-count":33,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2024,12,9]],"date-time":"2024-12-09T00:00:00Z","timestamp":1733702400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,9]],"date-time":"2024-12-09T00:00:00Z","timestamp":1733702400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1007\/s11760-024-03721-0","type":"journal-article","created":{"date-parts":[[2024,12,9]],"date-time":"2024-12-09T19:52:31Z","timestamp":1733773951000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Based-CLIP early fusion transformer for image caption"],"prefix":"10.1007","volume":"19","author":[{"given":"Jinyu","family":"Guo","sequence":"first","affiliation":[]},{"given":"Yuejia","family":"Li","sequence":"additional","affiliation":[]},{"given":"Guanghui","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Wenrui","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,9]]},"reference":[{"key":"3721_CR1","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"3721_CR2","doi-asserted-by":"crossref","unstructured":"Cornia, M., Baraldi, L., Cucchiara, R.: Show, control and tell: a framework for generating controllable and grounded captions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8299\u20138308 (2019)","DOI":"10.1109\/CVPR.2019.00850"},{"key":"3721_CR3","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"3721_CR4","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhutdinov, R., Zemel, R.S., Bengio, Y.: Show, attend and tell: Neural image caption generation with visual attention. Computer Science, 2048\u20132057 (2015)"},{"issue":"4","key":"3721_CR5","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","volume":"39","author":"O Vinyals","year":"2017","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: lessons learned from the 2015 MSCOCO image captioning challenge. IEEE Trans. Pattern Anal. Mach. Intell. 39(4), 652\u2013663 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3721_CR6","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Adv.Neural Inf. Process. Syst.30 (2017)"},{"key":"3721_CR7","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. Adv. Neural Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"3721_CR8","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"3721_CR9","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., : Learning transferable visual models from natural language supervision. In: Proceedings of the International Conference on Machine Learning, pp. 8748\u20138763 (2021)"},{"key":"3721_CR10","doi-asserted-by":"crossref","unstructured":"Lu, J., Xiong, C., Parikh, D., Socher, R.: Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 375\u2013383 (2017)","DOI":"10.1109\/CVPR.2017.345"},{"key":"3721_CR11","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: Towards real-time object detection with region proposal networks. Adv. Neural Inf. Process. Syst.28 (2015)"},{"key":"3721_CR12","doi-asserted-by":"crossref","unstructured":"Li, Y., Qi, H., Dai, J., Ji, X., Wei, Y.: Fully convolutional instance-aware semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2359\u20132367 (2017)","DOI":"10.1109\/CVPR.2017.472"},{"key":"3721_CR13","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: Densecap: fully convolutional localization networks for dense captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4565\u20134574 (2016)","DOI":"10.1109\/CVPR.2016.494"},{"key":"3721_CR14","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"3721_CR15","doi-asserted-by":"crossref","unstructured":"You, Q., Jin, H., Wang, Z., Fang, C., Luo, J.: Image captioning with semantic attention. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4651\u20134659 (2016)","DOI":"10.1109\/CVPR.2016.503"},{"key":"3721_CR16","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 10578\u201310587 (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"3721_CR17","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., Jiang, D.: Unicoder-vl: a universal encoder for vision and language by cross-modal pre-training. In: Proceedings of the Conference on Artificial Intelligence, vol. 34, pp. 11336\u201311344 (2020)","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"3721_CR18","doi-asserted-by":"crossref","unstructured":"Chen, Y., Rohrbach, M., Yan, Z., Shuicheng, Y., Feng, J., Kalantidis, Y.: Graph-based global reasoning networks. In: Proceedings of the International Conference on Pattern Recognition, pp. 433\u2013442 (2019)","DOI":"10.1109\/CVPR.2019.00052"},{"key":"3721_CR19","unstructured":"Parmar, N., Vaswani, A., Uszkoreit, J., Kaiser, L., Shazeer, N., Ku, A., Tran, D.: Image transformer. In: Proceedings of the International Conference on Machine Learning, pp. 4055\u20134064 (2018)"},{"key":"3721_CR20","doi-asserted-by":"crossref","unstructured":"Shi, Z., Zhou, X., Qiu, X., Zhu, X.: Improving image captioning with better use of captions. arXiv preprint arXiv:2006.11807 (2020)","DOI":"10.18653\/v1\/2020.acl-main.664"},{"key":"3721_CR21","doi-asserted-by":"crossref","unstructured":"Sahoo, S.K., Chalasani, S., Joshi, A., Iyer, K.N.: Enhancing classification with hierarchical scalable query on fusion transformer. In: 2023 IEEE International Conference on Consumer Electronics (ICCE), pp. 1\u20136 (2023)","DOI":"10.1109\/ICCE56470.2023.10043496"},{"issue":"8","key":"3721_CR22","doi-asserted-by":"publisher","first-page":"4257","DOI":"10.1109\/TCSVT.2023.3243725","volume":"33","author":"J Zhang","year":"2023","unstructured":"Zhang, J., Xie, Y., Ding, W., Wang, Z.: Cross on cross attention: deep fusion transformer for image captioning. IEEE Trans. Circ. Syst. Vid. Technol. 33(8), 4257\u20134268 (2023)","journal-title":"IEEE Trans. Circ. Syst. Vid. Technol."},{"key":"3721_CR23","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V.: Self-critical sequence training for image captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7008\u20137024 (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"3721_CR24","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"3721_CR25","doi-asserted-by":"crossref","unstructured":"Lin, C.-Y., Och, F.J.: Automatic evaluation of machine translation quality using longest common subsequence and skip-bigram statistics. In: Proceedings of the 42nd Annual Meeting of the Association for Computational Linguistics (ACL-04), pp. 605\u2013612 (2004)","DOI":"10.3115\/1218955.1219032"},{"key":"3721_CR26","doi-asserted-by":"crossref","unstructured":"Denkowski, M., Lavie, A.: Meteor universal: Language specific translation evaluation for any target language. In: Proceedings of the Ninth Workshop on Statistical Machine Translation, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"3721_CR27","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"3721_CR28","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: Spice: semantic propositional image caption evaluation. In: Proceedings of the European Conference on Computer Vision, pp. 382\u2013398 (2016)","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"3721_CR29","unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., Cao, Y.: Simvlm: Simple visual language model pretraining with weak supervision. arXiv preprint arXiv:2108.10904 (2021)"},{"key":"3721_CR30","doi-asserted-by":"crossref","unstructured":"Li, X., Yin, X., Li, C., Zhang, P., Hu, X., Zhang, L., Wang, L., Hu, H., Dong, L., Wei, F., : Oscar: object-semantics aligned pre-training for vision-language tasks. In: Proceedings of the European Conference on Computer Vision, pp. 121\u2013137 (2020)","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"3721_CR31","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900 (2022)"},{"key":"3721_CR32","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J., Gao, J.: Unified vision-language pre-training for image captioning and vqa. In: Proceedings of the Conference on Artificial Intelligence, vol. 34, pp. 13041\u201313049 (2020)","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"3721_CR33","doi-asserted-by":"crossref","unstructured":"Hu, X., Gan, Z., Wang, J., Yang, Z., Liu, Z., Lu, Y., Wang, L.: Scaling up vision-language pre-training for image captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 17980\u201317989 (2022)","DOI":"10.1109\/CVPR52688.2022.01745"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-024-03721-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-024-03721-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-024-03721-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,31]],"date-time":"2025-01-31T15:01:18Z","timestamp":1738335678000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-024-03721-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,9]]},"references-count":33,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,2]]}},"alternative-id":["3721"],"URL":"https:\/\/doi.org\/10.1007\/s11760-024-03721-0","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,9]]},"assertion":[{"value":"3 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 August 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 September 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 December 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"112"}}