{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T18:08:46Z","timestamp":1778782126412,"version":"3.51.4"},"reference-count":75,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276073"],"award-info":[{"award-number":["62276073"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004607","name":"Natural Science Foundation of Guangxi Province","doi-asserted-by":"publisher","award":["2026GXNSFAA00641057"],"award-info":[{"award-number":["2026GXNSFAA00641057"]}],"id":[{"id":"10.13039\/501100004607","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Engineering Applications of Artificial Intelligence"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.engappai.2026.114652","type":"journal-article","created":{"date-parts":[[2026,3,30]],"date-time":"2026-03-30T17:43:55Z","timestamp":1774892635000},"page":"114652","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Improving Transformer-based image captioning via deep collaborative cross-fusion"],"prefix":"10.1016","volume":"175","author":[{"given":"Junbo","family":"Hu","sequence":"first","affiliation":[]},{"given":"Tong","family":"Wu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5313-6134","authenticated-orcid":false,"given":"Zhixin","family":"Li","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.engappai.2026.114652_b1","doi-asserted-by":"crossref","first-page":"5984","DOI":"10.1109\/TMM.2022.3202690","article-title":"Image captioning with novel topics guidance and retrieval-based topics re-weighting","volume":"25","author":"Al-Qatf","year":"2022","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.engappai.2026.114652_b2","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S., 2016. Spice: Semantic propositional image caption evaluation. In: Proceedings of the European Conference on Computer Vision. pp. 382\u2013398.","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"10.1016\/j.engappai.2026.114652_b3","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L., 2018. Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 6077\u20136086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"10.1016\/j.engappai.2026.114652_b4","unstructured":"Banerjee, S., Lavie, A., 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the Acl Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/Or Summarization. pp. 65\u201372."},{"key":"10.1016\/j.engappai.2026.114652_b5","series-title":"Proceedings of the International Conference on Pattern Recognition","first-page":"4087","article-title":"CaMEL: mean teacher learning for image captioning","author":"Barraco","year":"2022"},{"key":"10.1016\/j.engappai.2026.114652_b6","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110555","article-title":"CAST: Cross-modal retrieval and visual conditioning for image captioning","volume":"153","author":"Cao","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.engappai.2026.114652_b7","doi-asserted-by":"crossref","first-page":"120","DOI":"10.1016\/j.neunet.2021.11.017","article-title":"Event-centric multi-modal fusion method for dense video captioning","volume":"146","author":"Chang","year":"2022","journal-title":"Neural Netw."},{"key":"10.1016\/j.engappai.2026.114652_b8","unstructured":"Chin-Yew, L., 2004. Rouge: A package for automatic evaluation of summaries. In: Proceedings of the Workshop on Text Summarization Branches Out. pp. 74\u201381."},{"key":"10.1016\/j.engappai.2026.114652_b9","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2024.109948","article-title":"Envqa: Improving visual question answering model by enriching the visual feature","volume":"142","author":"Chowdhury","year":"2025","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2026.114652_b10","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R., 2020. Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10578\u201310587.","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"10.1016\/j.engappai.2026.114652_b11","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2025.110997","article-title":"Understanding question-answering systems: Evolution, applications, trends, and challenges","volume":"156","author":"Farea","year":"2025","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2026.114652_b12","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.125847","article-title":"Multi-granularity semantic relational mapping for image caption","volume":"264","author":"Gao","year":"2025","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.engappai.2026.114652_b13","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Zhu, X., Yao, P., Lu, S., Lu, H., 2020. Normalized and geometry-aware self-attention network for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10327\u201310336.","DOI":"10.1109\/CVPR42600.2020.01034"},{"key":"10.1016\/j.engappai.2026.114652_b14","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J., 2016. Deep residual learning for image recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.engappai.2026.114652_b15","first-page":"11137","article-title":"Image captioning: Transforming objects into words","volume":"32","author":"Herdade","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114652_b16","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"17980","article-title":"Scaling up vision-language pre-training for image captioning","author":"Hu","year":"2022"},{"key":"10.1016\/j.engappai.2026.114652_b17","doi-asserted-by":"crossref","unstructured":"Hu, J., Li, Z., 2024. Distilled Cross-Combination Transformer for Image Captioning with Dual Refined Visual Features. In: Proceedings of the 32nd ACM International Conference on Multimedia. pp. 4465\u20134474.","DOI":"10.1145\/3664647.3681161"},{"key":"10.1016\/j.engappai.2026.114652_b18","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2024.106710","article-title":"Exploring refined dual visual features cross-combination for image captioning","volume":"180","author":"Hu","year":"2024","journal-title":"Neural Netw."},{"key":"10.1016\/j.engappai.2026.114652_b19","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.-Y., 2019. Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 4634\u20134643.","DOI":"10.1109\/ICCV.2019.00473"},{"key":"10.1016\/j.engappai.2026.114652_b20","doi-asserted-by":"crossref","first-page":"7706","DOI":"10.1109\/TCSVT.2022.3181490","article-title":"Double-stream position learning transformer network for image captioning","volume":"32","author":"Jiang","year":"2022","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114652_b21","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2025.111654","article-title":"Aspect-based sentiment analysis with semantic and syntactic enhanced multi-layer fusion model","volume":"159","author":"Jin","year":"2025","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2026.114652_b22","doi-asserted-by":"crossref","first-page":"2367","DOI":"10.1109\/TMM.2023.3295098","article-title":"Memory-based augmentation network for video captioning","volume":"26","author":"Jing","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.engappai.2026.114652_b23","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L., 2015. Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 3128\u20133137.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"10.1016\/j.engappai.2026.114652_b24","first-page":"1","article-title":"Exploring visual relationships via transformer-based graphs for enhanced image captioning","volume":"20","author":"Li","year":"2024","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"10.1016\/j.engappai.2026.114652_b25","doi-asserted-by":"crossref","unstructured":"Li, Y., Pan, Y., Yao, T., Mei, T., 2022. Comprehending and ordering semantics for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 17990\u201317999.","DOI":"10.1109\/CVPR52688.2022.01746"},{"key":"10.1016\/j.engappai.2026.114652_b26","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2023.104864","article-title":"External knowledge-assisted transformer for image captioning","volume":"140","author":"Li","year":"2023","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.engappai.2026.114652_b27","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2025.110358","article-title":"Dynamic window sampling strategy for image captioning","volume":"148","author":"Li","year":"2025","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2026.114652_b28","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L., 2014. Microsoft coco: Common objects in context. In: Proceedings of the European Conference on Computer Vision. pp. 740\u2013755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"10.1016\/j.engappai.2026.114652_b29","doi-asserted-by":"crossref","unstructured":"Lu, J., Xiong, C., Parikh, D., Socher, R., 2017. Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 375\u2013383.","DOI":"10.1109\/CVPR.2017.345"},{"key":"10.1016\/j.engappai.2026.114652_b30","doi-asserted-by":"crossref","unstructured":"Luo, Y., Ji, J., Sun, X., Cao, L., Wu, Y., Huang, F., Lin, C.-W., Ji, R., 2021. Dual-level collaborative transformer for image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence. pp. 2286\u20132293.","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"10.1016\/j.engappai.2026.114652_b31","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109420","article-title":"Towards local visual modeling for image captioning","volume":"138","author":"Ma","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.engappai.2026.114652_b32","doi-asserted-by":"crossref","first-page":"11778","DOI":"10.1038\/s41598-025-96523-4","article-title":"Hybrid vision GNNs based early detection and protection against pest diseases in coffee plants","volume":"15","author":"Maruthai","year":"2025","journal-title":"Sci. Rep."},{"issue":"2","key":"10.1016\/j.engappai.2026.114652_b33","first-page":"503","article-title":"Human complementation must aid automation to mitigate unemployment effects due to AI technologies in the labor market","volume":"5","author":"\u00d6zer","year":"2024","journal-title":"REFLEKTi\u0307f Sos. Bilim. Derg."},{"issue":"3","key":"10.1016\/j.engappai.2026.114652_b34","first-page":"1105","article-title":"Creative alienation in art due to artificial intelligence","volume":"6","author":"\u00d6zer","year":"2025","journal-title":"REFLEKTi\u0307f Sos. Bilim. Derg."},{"key":"10.1016\/j.engappai.2026.114652_b35","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, Y., Mei, T., 2020. X-linear attention networks for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10971\u201310980.","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"10.1016\/j.engappai.2026.114652_b36","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J., 2002. Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics. pp. 311\u2013318.","DOI":"10.3115\/1073083.1073135"},{"key":"10.1016\/j.engappai.2026.114652_b37","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2024.106560","article-title":"Robust visual question answering via polarity enhancement and contrast","volume":"179","author":"Peng","year":"2024","journal-title":"Neural Netw."},{"key":"10.1016\/j.engappai.2026.114652_b38","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111394","article-title":"Unbiased VQA via modal information interaction and question transformation","volume":"162","author":"Peng","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.engappai.2026.114652_b39","doi-asserted-by":"crossref","unstructured":"Prabu, S., Saravanan, R., Gomathi, T., Surendran, R., Deepa, R., 2025. Generative Adversarial Networks for Endoscopic Image Classification. In: Proceedings of International Conference on Computing Technologies & Data Communication. pp. 1\u20137.","DOI":"10.1109\/ICCTDC64446.2025.11158984"},{"key":"10.1016\/j.engappai.2026.114652_b40","doi-asserted-by":"crossref","unstructured":"Pramila, R.P., Subhashini, R., et al., 2024. Synergistic Deep Learning Pipeline for Dermatological Image Classification: Cascaded U-Net Denoising and Feature-Selective CNN with Fused Feature Extraction. In: Proceedings of International Conference on Engineering Technologies and Applied Sciences. pp. 1\u20136.","DOI":"10.1109\/ICETAS62372.2024.11120165"},{"key":"10.1016\/j.engappai.2026.114652_b41","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al., 2021. Learning transferable visual models from natural language supervision. In: Proceedings of the International Conference on Machine Learning. pp. 8748\u20138763."},{"key":"10.1016\/j.engappai.2026.114652_b42","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V., 2017. Self-critical sequence training for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 7008\u20137024.","DOI":"10.1109\/CVPR.2017.131"},{"key":"10.1016\/j.engappai.2026.114652_b43","doi-asserted-by":"crossref","unstructured":"Seo, P.H., Nagrani, A., Arnab, A., Schmid, C., 2022. End-to-end generative pretraining for multimodal video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 17959\u201317968.","DOI":"10.1109\/CVPR52688.2022.01743"},{"key":"10.1016\/j.engappai.2026.114652_b44","series-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014"},{"key":"10.1016\/j.engappai.2026.114652_b45","doi-asserted-by":"crossref","first-page":"1","DOI":"10.47163\/agrociencia.v59i5.3380","article-title":"Al-biruni earth radius optimization for enhanced environmental data analysis in remote sensing imagery","author":"Sivasubramanian","year":"2025","journal-title":"Agrociencia"},{"key":"10.1016\/j.engappai.2026.114652_b46","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D., 2015. Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 4566\u20134575.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"10.1016\/j.engappai.2026.114652_b47","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3156","article-title":"Show and tell: A neural image caption generator","author":"Vinyals","year":"2015"},{"key":"10.1016\/j.engappai.2026.114652_b48","doi-asserted-by":"crossref","unstructured":"Wang, W., Chen, Z., Hu, H., 2019. Hierarchical attention network for image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence. pp. 8957\u20138964.","DOI":"10.1609\/aaai.v33i01.33018957"},{"key":"10.1016\/j.engappai.2026.114652_b49","doi-asserted-by":"crossref","first-page":"11900","DOI":"10.1109\/TCSVT.2024.3425513","article-title":"Regular constrained multimodal fusion for image captioning","volume":"34","author":"Wang","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114652_b50","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2022.105194","article-title":"Dynamic-balanced double-attention fusion for image captioning","volume":"114","author":"Wang","year":"2022","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2026.114652_b51","doi-asserted-by":"crossref","unstructured":"Wang, Y., Xu, J., Sun, Y., 2022. End-to-end transformer based model for image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence. pp. 2585\u20132594.","DOI":"10.1609\/aaai.v36i3.20160"},{"issue":"2","key":"10.1016\/j.engappai.2026.114652_b52","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3439734","article-title":"Integrating scene semantic knowledge into image captioning","volume":"17","author":"Wei","year":"2021","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl. (TOMM)"},{"key":"10.1016\/j.engappai.2026.114652_b53","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2024.106519","article-title":"Mining core information by evaluating semantic importance for unpaired image captioning","volume":"179","author":"Wei","year":"2024","journal-title":"Neural Netw."},{"key":"10.1016\/j.engappai.2026.114652_b54","doi-asserted-by":"crossref","unstructured":"Wu, M., Zhang, X., Sun, X., Zhou, Y., Chen, C., Gu, J., Sun, X., Ji, R., 2022. Difnet: Boosting visual information flow for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18020\u201318029.","DOI":"10.1109\/CVPR52688.2022.01749"},{"key":"10.1016\/j.engappai.2026.114652_b55","doi-asserted-by":"crossref","first-page":"5762","DOI":"10.1109\/TCSVT.2022.3155795","article-title":"Adaptive path selection for dynamic image captioning","volume":"32","author":"Xian","year":"2022","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114652_b56","doi-asserted-by":"crossref","first-page":"129","DOI":"10.1016\/j.neunet.2022.01.011","article-title":"Dual global enhanced transformer for image captioning","volume":"148","author":"Xian","year":"2022","journal-title":"Neural Netw."},{"key":"10.1016\/j.engappai.2026.114652_b57","unstructured":"Xu, K., Ba, J.L., Kiros, R., Cho, K., Courville, A., Salakhutdinov, R., Zemel, R.S., Bengio, Y., 2015. Show, attend and tell: Neural image caption generation with visual attention. In: Proceedings of the International Conference on Machine Learning. pp. 2048\u20132057."},{"key":"10.1016\/j.engappai.2026.114652_b58","doi-asserted-by":"crossref","first-page":"43","DOI":"10.1109\/TCSVT.2021.3067449","article-title":"Task-adaptive attention for image captioning","volume":"32","author":"Yan","year":"2021","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114652_b59","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10685","article-title":"Auto-encoding scene graphs for image captioning","author":"Yang","year":"2019"},{"key":"10.1016\/j.engappai.2026.114652_b60","doi-asserted-by":"crossref","unstructured":"Yao, T., Pan, Y., Li, Y., Mei, T., 2019. Hierarchy parsing for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 2621\u20132629.","DOI":"10.1109\/ICCV.2019.00271"},{"key":"10.1016\/j.engappai.2026.114652_b61","doi-asserted-by":"crossref","first-page":"4467","DOI":"10.1109\/TCSVT.2019.2947482","article-title":"Multimodal transformer with multi-view visual representation for image captioning","volume":"30","author":"Yu","year":"2019","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114652_b62","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Cui, Y., Tao, D., Tian, Q., 2019. Deep modular co-attention networks for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 6281\u20136290.","DOI":"10.1109\/CVPR.2019.00644"},{"key":"10.1016\/j.engappai.2026.114652_b63","doi-asserted-by":"crossref","unstructured":"Zeng, P., Zhu, J., Song, J., Gao, L., 2022. Progressive tree-structured prototype network for end-to-end image captioning. In: Proceedings of the 30th ACM International Conference on Multimedia. pp. 5210\u20135218.","DOI":"10.1145\/3503161.3548024"},{"key":"10.1016\/j.engappai.2026.114652_b64","doi-asserted-by":"crossref","first-page":"1785","DOI":"10.1109\/TNNLS.2022.3185320","article-title":"Adaptive semantic-enhanced transformer for image captioning","volume":"35","author":"Zhang","year":"2022","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.engappai.2026.114652_b65","doi-asserted-by":"crossref","unstructured":"Zhang, P., Li, X., Hu, X., Yang, J., Zhang, L., Wang, L., Choi, Y., Gao, J., 2021. Vinvl: Revisiting visual representations in vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 5579\u20135588.","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"10.1016\/j.engappai.2026.114652_b66","doi-asserted-by":"crossref","DOI":"10.1016\/j.jvcir.2021.103044","article-title":"Parallel-fusion LSTM with synchronous semantic and visual information for image captioning","volume":"75","author":"Zhang","year":"2021","journal-title":"J. Vis. Commun. Image Represent."},{"key":"10.1016\/j.engappai.2026.114652_b67","doi-asserted-by":"crossref","first-page":"43","DOI":"10.1016\/j.patrec.2020.12.020","article-title":"Image captioning with transformer and knowledge graph","volume":"143","author":"Zhang","year":"2021","journal-title":"Pattern Recognit. Lett."},{"key":"10.1016\/j.engappai.2026.114652_b68","doi-asserted-by":"crossref","unstructured":"Zhang, X., Sun, X., Luo, Y., Ji, J., Zhou, Y., Wu, Y., Huang, F., Ji, R., 2021. Rstnet: Captioning with adaptive attention on visual and non-visual words. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 15465\u201315474.","DOI":"10.1109\/CVPR46437.2021.01521"},{"key":"10.1016\/j.engappai.2026.114652_b69","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2025.111934","article-title":"Simplified syntax-guided domain-shared representation learning for cross-domain aspect-based sentiment analysis","volume":"160","author":"Zhang","year":"2025","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2026.114652_b70","doi-asserted-by":"crossref","first-page":"4829","DOI":"10.1109\/TCSVT.2023.3336371","article-title":"SPT: Spatial pyramid transformer for image captioning","volume":"34","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114652_b71","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2024.109102","article-title":"Multi-scale features with temporal information guidance for video captioning","volume":"137","author":"Zhao","year":"2024","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2026.114652_b72","doi-asserted-by":"crossref","unstructured":"Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H., Zhang, W., 2021. Informer: Beyond efficient transformer for long sequence time-series forecasting. In: Proceedings of the AAAI Conference on Artificial Intelligence. pp. 11106\u201311115.","DOI":"10.1609\/aaai.v35i12.17325"},{"key":"10.1016\/j.engappai.2026.114652_b73","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2025.110330","article-title":"Geometry-sensitive semantic modeling in visual and visual-language domains for image captioning","volume":"147","author":"Zhu","year":"2025","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2026.114652_b74","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2024.109884","article-title":"A cross-modal collaborative guiding network for sarcasm explanation in multi-modal multi-party dialogues","volume":"142","author":"Zhuang","year":"2025","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2026.114652_b75","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.113029","article-title":"DyCR-Net: A dynamic context-aware routing network for multi-modal sarcasm detection in conversation","volume":"310","author":"Zhuang","year":"2025","journal-title":"Knowl.-Based Syst."}],"container-title":["Engineering Applications of Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0952197626009346?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0952197626009346?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T17:17:45Z","timestamp":1778779065000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0952197626009346"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":75,"alternative-id":["S0952197626009346"],"URL":"https:\/\/doi.org\/10.1016\/j.engappai.2026.114652","relation":{},"ISSN":["0952-1976"],"issn-type":[{"value":"0952-1976","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Improving Transformer-based image captioning via deep collaborative cross-fusion","name":"articletitle","label":"Article Title"},{"value":"Engineering Applications of Artificial Intelligence","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.engappai.2026.114652","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"114652"}}