{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,25]],"date-time":"2025-12-25T07:27:02Z","timestamp":1766647622540,"version":"3.29.0"},"reference-count":61,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2024,9,6]],"date-time":"2024-09-06T00:00:00Z","timestamp":1725580800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,6]],"date-time":"2024-09-06T00:00:00Z","timestamp":1725580800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2024,10]]},"DOI":"10.1007\/s00530-024-01470-1","type":"journal-article","created":{"date-parts":[[2024,9,6]],"date-time":"2024-09-06T14:02:57Z","timestamp":1725631377000},"update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Exploring coherence from heterogeneous representations for OCR image captioning"],"prefix":"10.1007","volume":"30","author":[{"given":"Yao","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Zijie","family":"Song","sequence":"additional","affiliation":[]},{"given":"Zhenzhen","family":"Hu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,6]]},"reference":[{"key":"1470_CR1","doi-asserted-by":"crossref","unstructured":"Sidorov, O., Hu, R., Rohrbach, M., Singh, A.: TextCaps: a dataset for image captioning with reading comprehension. In: European Conference on Computer Vision, pp. 742\u2013758. Springer (2020)","DOI":"10.1007\/978-3-030-58536-5_44"},{"key":"1470_CR2","doi-asserted-by":"crossref","unstructured":"Zhu, Z., Yu, J., Wang, Y., Sun, Y., Hu, Y., Wu, Q.: Mucko: multi-layer cross-modal knowledge reasoning for fact-based visual question answering. arXiv preprint arXiv:2006.09073 (2020)","DOI":"10.24963\/ijcai.2020\/153"},{"key":"1470_CR3","unstructured":"Yu, W., Zhou, J., Yu, W., Liang, X., Xiao, N.: Heterogeneous graph learning for visual commonsense reasoning. In: Advances in neural information processing systems, vol. 32 (2019)"},{"key":"1470_CR4","doi-asserted-by":"crossref","unstructured":"Fan, C., Zhang, X., Zhang, S., Wang, W., Zhang, C., Huang, H.: Heterogeneous memory enhanced multimodal attention model for video question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1999\u20132007 (2019)","DOI":"10.1109\/CVPR.2019.00210"},{"issue":"1","key":"1470_CR5","doi-asserted-by":"publisher","first-page":"207","DOI":"10.1109\/TCSVT.2023.3283430","volume":"34","author":"Y Hu","year":"2023","unstructured":"Hu, Y., Fu, J., Chen, M., Gao, J., Dong, J., Fan, B., Liu, H.: Learning proposal-aware re-ranking for weakly-supervised temporal action localization. IEEE Trans. Circuits Syst. Video Technol. 34(1), 207\u2013220 (2023)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1470_CR6","doi-asserted-by":"crossref","unstructured":"Cai, D., Qian, S., Fang, Q., Hu, J., Ding, W., Xu, C.: Heterogeneous graph contrastive learning network for personalized micro-video recommendation. IEEE Trans. Multimedia (2022)","DOI":"10.1109\/TMM.2021.3059508"},{"key":"1470_CR7","doi-asserted-by":"crossref","unstructured":"Gao, J., Zhang, T., Xu, C.: I know the relationships: zero-shot action recognition via two-stream graph convolutional networks and knowledge graphs. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 33, pp. 8303\u20138311 (2019)","DOI":"10.1609\/aaai.v33i01.33018303"},{"key":"1470_CR8","doi-asserted-by":"crossref","unstructured":"Hu, Z., Wang, Z., Song, Z., Hong, R.: Dual video summarization: From frames to captions. In: IJCAI, pp. 846\u2013854 (2023)","DOI":"10.24963\/ijcai.2023\/94"},{"key":"1470_CR9","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., Bengio, Y.: Show, attend and tell: neural image caption generation with visual attention. In: International Conference on Machine Learning, pp. 2048\u20132057. PMLR (2015)"},{"key":"1470_CR10","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"1470_CR11","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.-Y.: Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4634\u20134643 (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"1470_CR12","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1016\/j.neucom.2022.01.081","volume":"482","author":"JH Tan","year":"2022","unstructured":"Tan, J.H., Tan, Y.H., Chan, C.S., Chuah, J.H.: ACORT: a compact object relation transformer for parameter efficient image captioning. Neurocomputing 482, 60\u201372 (2022)","journal-title":"Neurocomputing"},{"key":"1470_CR13","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1016\/j.neucom.2022.11.045","volume":"519","author":"N Hu","year":"2023","unstructured":"Hu, N., Fan, C., Ming, Y., Feng, F.: MAENet: a novel multi-head association attention enhancement network for completing intra-modal interaction in image captioning. Neurocomputing 519, 69\u201381 (2023)","journal-title":"Neurocomputing"},{"key":"1470_CR14","doi-asserted-by":"crossref","unstructured":"Wang, J., Tang, J., Yang, M., Bai, X., Luo, J.: Improving OCR-based image captioning by incorporating geometrical relationship. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1306\u20131315 (2021)","DOI":"10.1109\/CVPR46437.2021.00136"},{"key":"1470_CR15","doi-asserted-by":"crossref","unstructured":"Xu, G., Niu, S., Tan, M., Luo, Y., Du, Q., Wu, Q.: Towards accurate text-based image captioning with content diversity exploration. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12637\u201312646 (2021)","DOI":"10.1109\/CVPR46437.2021.01245"},{"key":"1470_CR16","doi-asserted-by":"crossref","unstructured":"Zhang, W., Shi, H., Guo, J., Zhang, S., Cai, Q., Li, J., Luo, S., Zhuang, Y.: Magic: Multimodal relational graph adversarial inference for diverse and unpaired text-based image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 3335\u20133343 (2022)","DOI":"10.1609\/aaai.v36i3.20243"},{"key":"1470_CR17","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1162\/tacl_a_00051","volume":"5","author":"P Bojanowski","year":"2017","unstructured":"Bojanowski, P., Grave, E., Joulin, A., Mikolov, T.: Enriching word vectors with subword information. Trans. Assoc. Comput. Linguist. 5, 135\u2013146 (2017)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"1470_CR18","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhu, Z., Ye, N., Guadarrama, S., Murphy, K.: Improved image captioning via policy gradient optimization of spider. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 873\u2013881 (2017)","DOI":"10.1109\/ICCV.2017.100"},{"key":"1470_CR19","first-page":"91","volume":"28","author":"S Ren","year":"2015","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. Adv. Neural Inf. Process. Syst. 28, 91\u201399 (2015)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1470_CR20","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"1470_CR21","first-page":"3608","volume":"35","author":"Q Zhu","year":"2021","unstructured":"Zhu, Q., Gao, C., Wang, P., Wu, Q.: Simple is not easy: a simple strong baseline for TextVQA and TextCaps. Proc. AAAI Conf. Artif. Intell. 35, 3608\u20133615 (2021)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"1470_CR22","doi-asserted-by":"crossref","unstructured":"Yang, Z., Lu, Y., Wang, J., Yin, X., Florencio, D., Wang, L., Zhang, C., Zhang, L., Luo, J.: Tap: text-aware pre-training for Text-VQA and Text-Caption. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8751\u20138761 (2021)","DOI":"10.1109\/CVPR46437.2021.00864"},{"key":"1470_CR23","doi-asserted-by":"crossref","unstructured":"Wang, Z., Bao, R., Wu, Q., Liu, S.: Confidence-aware non-repetitive multimodal transformers for TextCaps. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, pp. 2835\u20132843 (2021)","DOI":"10.1609\/aaai.v35i4.16389"},{"issue":"11","key":"1470_CR24","doi-asserted-by":"publisher","first-page":"992","DOI":"10.14778\/3402707.3402736","volume":"4","author":"Y Sun","year":"2011","unstructured":"Sun, Y., Han, J., Yan, X., Yu, P.S., Wu, T.: PathSim: meta path-based top-K similarity search in heterogeneous information networks. Proc. VLDB Endow. 4(11), 992\u20131003 (2011)","journal-title":"Proc. VLDB Endow."},{"key":"1470_CR25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-01902-9","volume-title":"Mining Heterogeneous Information Networks: Principles and Methodologies","author":"Y Sun","year":"2012","unstructured":"Sun, Y., Han, J.: Mining Heterogeneous Information Networks: Principles and Methodologies. Morgan & Claypool Publishers, San Rafael (2012)"},{"key":"1470_CR26","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"1470_CR27","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"1470_CR28","doi-asserted-by":"crossref","unstructured":"Li, G., Zhu, L., Liu, P., Yang, Y.: Entangled transformer for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00902"},{"key":"1470_CR29","unstructured":"Yang, X., Wu, Y., Yang, M., Chen, H., Geng, X.: Exploring diverse in-context configurations for image captioning. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"1470_CR30","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: CIDEr: Consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1470_CR31","doi-asserted-by":"crossref","unstructured":"Li, H., Wang, P., Shen, C.: Towards end-to-end text spotting with convolutional recurrent neural networks. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2017)","DOI":"10.1109\/ICCV.2017.560"},{"key":"1470_CR32","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: European Conference on Computer Vision, pp. 213\u2013229. Springer (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"1470_CR33","doi-asserted-by":"crossref","unstructured":"Ye, M., Zhang, J., Zhao, S., Liu, J., Liu, T., Du, B., Tao, D.: DeepSolo: let transformer decoder with explicit points solo for text spotting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19348\u201319357 (2023)","DOI":"10.1109\/CVPR52729.2023.01854"},{"key":"1470_CR34","doi-asserted-by":"crossref","unstructured":"Huang, M., Zhang, J., Peng, D., Lu, H., Huang, C., Liu, Y., Bai, X., Jin, L.: ESTextSpotter: towards better scene text spotting with explicit synergy in transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 19495\u201319505 (2023)","DOI":"10.1109\/ICCV51070.2023.01786"},{"key":"1470_CR35","doi-asserted-by":"crossref","unstructured":"Wang, J., Tang, J., Luo, J.: Multimodal attention with image text spatial relationship for OCR-based image captioning. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 4337\u20134345 (2020)","DOI":"10.1145\/3394171.3413753"},{"key":"1470_CR36","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1016\/j.neunet.2023.03.010","volume":"162","author":"Q Wang","year":"2023","unstructured":"Wang, Q., Deng, H., Wu, X., Yang, Z., Liu, Y., Wang, Y., Hao, G.: LCM-Captioner: a lightweight text-based image captioning method with collaborative mechanism between vision and text. Neural Netw. 162, 318\u2013329 (2023)","journal-title":"Neural Netw."},{"key":"1470_CR37","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"1470_CR38","doi-asserted-by":"publisher","first-page":"55706","DOI":"10.1109\/ACCESS.2023.3282444","volume":"11","author":"A Ueda","year":"2023","unstructured":"Ueda, A., Yang, W., Sugiura, K.: Switching text-based image encoders for captioning images with text. IEEE Access 11, 55706\u201355715 (2023). https:\/\/doi.org\/10.1109\/ACCESS.2023.3282444","journal-title":"IEEE Access"},{"key":"1470_CR39","doi-asserted-by":"crossref","unstructured":"Zeng, Z., Zhang, H., Lu, R., Wang, D., Chen, B., Wang, Z.: ConZIC: Controllable zero-shot image captioning by sampling-based polishing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23465\u201323476 (2023)","DOI":"10.1109\/CVPR52729.2023.02247"},{"key":"1470_CR40","doi-asserted-by":"crossref","unstructured":"Xu, D., Zhao, W., Cai, Y., Huang, Q.: Zero-TextCap: zero-shot framework for text-based image captioning. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 4949\u20134957 (2023)","DOI":"10.1145\/3581783.3612571"},{"key":"1470_CR41","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2023.104751","volume":"136","author":"G Lv","year":"2023","unstructured":"Lv, G., Sun, Y., Nian, F., Zhu, M., Tang, W., Hu, Z.: COME: clip-OCR and master object for text image captioning. Image Vis. Comput. 136, 104751 (2023)","journal-title":"Image Vis. Comput."},{"key":"1470_CR42","doi-asserted-by":"crossref","unstructured":"Chen, L., Li, J., Dong, X., Zhang, P., He, C., Wang, J., Zhao, F., Lin, D.: ShareGPT4V: improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"1470_CR43","doi-asserted-by":"crossref","unstructured":"Jayaswal, V., Ji, S., Kumar, A., Kumar, V., Prakash, A.: OCR based deep learning approach for image captioning. In: 2024 IEEE International Conference on Computing, Power and Communication Technologies (IC2PCT), vol. 5, pp. 239\u2013244. IEEE (2024)","DOI":"10.1109\/IC2PCT60090.2024.10486670"},{"key":"1470_CR44","doi-asserted-by":"crossref","unstructured":"Wang, N., Xie, J., Wu, J., Jia, M., Li, L.: Controllable image captioning via prompting. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, pp. 2617\u20132625 (2023)","DOI":"10.1609\/aaai.v37i2.25360"},{"key":"1470_CR45","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107563","volume":"108","author":"J Yu","year":"2020","unstructured":"Yu, J., Zhu, Z., Wang, Y., Zhang, W., Hu, Y., Tan, J.: Cross-modal knowledge reasoning for knowledge-based visual question answering. Pattern Recognit. 108, 107563 (2020)","journal-title":"Pattern Recognit."},{"issue":"5","key":"1470_CR46","doi-asserted-by":"publisher","first-page":"3017","DOI":"10.1007\/s00530-021-00867-6","volume":"29","author":"Z Song","year":"2023","unstructured":"Song, Z., Hu, Z., Hong, R.: Efficient and self-adaptive rationale knowledge base for visual commonsense reasoning. Multimedia Syst. 29(5), 3017\u20133026 (2023)","journal-title":"Multimedia Syst."},{"key":"1470_CR47","doi-asserted-by":"crossref","unstructured":"Jiang, P., Han, Y.: Reasoning with heterogeneous graph alignment for video question answering. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 11109\u201311116 (2020)","DOI":"10.1609\/aaai.v34i07.6767"},{"key":"1470_CR48","doi-asserted-by":"publisher","first-page":"805","DOI":"10.1109\/TMM.2021.3059508","volume":"24","author":"D Cai","year":"2021","unstructured":"Cai, D., Qian, S., Fang, Q., Xu, C.: Heterogeneous hierarchical feature aggregation network for personalized micro-video recommendation. IEEE Trans. Multimedia 24, 805\u2013818 (2021)","journal-title":"IEEE Trans. Multimedia"},{"key":"1470_CR49","doi-asserted-by":"crossref","unstructured":"Yu, T., Yang, Y., Li, Y., Liu, L., Fei, H., Li, P.: Heterogeneous attention network for effective and efficient cross-modal retrieval. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1146\u20131156 (2021)","DOI":"10.1145\/3404835.3462924"},{"key":"1470_CR50","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1016\/j.neunet.2022.08.028","volume":"157","author":"J Liu","year":"2023","unstructured":"Liu, J., Song, L., Wang, G., Shang, X.: Meta-HGT: metapath-aware hypergraph transformer for heterogeneous information network embedding. Neural Netw. 157, 65\u201376 (2023)","journal-title":"Neural Netw."},{"key":"1470_CR51","doi-asserted-by":"crossref","unstructured":"Yang, X., Yan, M., Pan, S., Ye, X., Fan, D.: Simple and efficient heterogeneous graph neural network. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, pp. 10816\u201310824 (2023)","DOI":"10.1609\/aaai.v37i9.26283"},{"key":"1470_CR52","doi-asserted-by":"crossref","unstructured":"Borisyuk, F., Gordo, A., Sivakumar, V.: Rosetta: Large scale system for text detection and recognition in images. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 71\u201379 (2018)","DOI":"10.1145\/3219819.3219861"},{"issue":"1","key":"1470_CR53","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L.-J., Shamma, D.A., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"1470_CR54","doi-asserted-by":"crossref","unstructured":"Hu, R., Singh, A., Darrell, T., Rohrbach, M.: Iterative answer prediction with pointer-augmented multimodal transformers for TextVQA. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9992\u201310002 (2020)","DOI":"10.1109\/CVPR42600.2020.01001"},{"key":"1470_CR55","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3384678","author":"Z Song","year":"2024","unstructured":"Song, Z., Hu, Z., Zhou, Y., Zhao, Y., Hong, R., Wang, M.: Embedded heterogeneous attention transformer for cross-lingual image captioning. IEEE Trans. Multimedia (2024). https:\/\/doi.org\/10.1109\/TMM.2024.3384678","journal-title":"IEEE Trans. Multimedia"},{"key":"1470_CR56","doi-asserted-by":"crossref","unstructured":"Tang, W., Hu, Z., Song, Z., Hong, R.: OCR-oriented master object for text image captioning. In: Proceedings of the 2022 International Conference on Multimedia Retrieval, pp. 39\u201343 (2022)","DOI":"10.1145\/3512527.3531431"},{"key":"1470_CR57","doi-asserted-by":"crossref","unstructured":"Schlichtkrull, M., Kipf, T.N., Bloem, P., Van Den\u00a0Berg, R., Titov, I., Welling, M.: Modeling relational data with graph convolutional networks. In: European Semantic Web Conference, pp. 593\u2013607. Springer (2018)","DOI":"10.1007\/978-3-319-93417-4_38"},{"key":"1470_CR58","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"1470_CR59","doi-asserted-by":"crossref","unstructured":"Denkowski, M., Lavie, A.: Meteor universal: language specific translation evaluation for any target language. In: Proceedings of the Ninth Workshop on Statistical Machine Translation, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"1470_CR60","unstructured":"Lin, C.-Y.: Rouge: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"1470_CR61","unstructured":"Singh, A., Goswami, V., Natarajan, V., Jiang, Y., Chen, X., Shah, M., Rohrbach, M., Batra, D., Parikh, D.: MMF: a multimodal framework for vision and language research. https:\/\/github.com\/facebookresearch\/mmf (2020)"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-024-01470-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-024-01470-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-024-01470-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T21:15:32Z","timestamp":1732742132000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-024-01470-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,6]]},"references-count":61,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2024,10]]}},"alternative-id":["1470"],"URL":"https:\/\/doi.org\/10.1007\/s00530-024-01470-1","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2024,9,6]]},"assertion":[{"value":"10 July 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 August 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 September 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"262"}}