{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T16:03:36Z","timestamp":1779811416163,"version":"3.53.1"},"reference-count":87,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100004543","name":"China Scholarship Council","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004543","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010264","name":"Shanghai Maritime University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100010264","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Engineering Applications of Artificial Intelligence"],"published-print":{"date-parts":[[2026,8]]},"DOI":"10.1016\/j.engappai.2026.114849","type":"journal-article","created":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T11:22:19Z","timestamp":1778757739000},"page":"114849","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"P1","title":["Multi-Granularity Modal Interaction and Fusion framework for vision-language tasks"],"prefix":"10.1016","volume":"178","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7714-0798","authenticated-orcid":false,"given":"Yangshuyi","family":"Xu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0325-6760","authenticated-orcid":false,"given":"Guangzhong","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6894-2550","authenticated-orcid":false,"given":"Xiang","family":"Shen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3112-6699","authenticated-orcid":false,"given":"Xiuying","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1634-9840","authenticated-orcid":false,"given":"Huiyu","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.engappai.2026.114849_b1","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L., 2018. Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 6077\u20136086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"10.1016\/j.engappai.2026.114849_b2","doi-asserted-by":"crossref","first-page":"10209","DOI":"10.1007\/s00521-019-04559-1","article-title":"Gated multimodal networks","volume":"32","author":"Arevalo","year":"2020","journal-title":"Neural Comput. Appl."},{"key":"10.1016\/j.engappai.2026.114849_b3","first-page":"I","article-title":"Attention is all you need","volume":"30","author":"Ashish","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"2","key":"10.1016\/j.engappai.2026.114849_b4","doi-asserted-by":"crossref","first-page":"1135","DOI":"10.1109\/TCSVT.2023.3291379","article-title":"See and learn more: Dense caption-aware representation for visual question answering","volume":"34","author":"Bi","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114849_b5","first-page":"1","article-title":"Bilateral cross-modality graph matching attention for feature fusion in visual question answering","author":"Cao","year":"2022","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"key":"10.1016\/j.engappai.2026.114849_b6","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.108980","article-title":"CAAN: Context-aware attention network for visual question answering","volume":"132","author":"Chen","year":"2022","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.engappai.2026.114849_b7","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.110084","article-title":"MPCCT: Multimodal vision-language learning paradigm with context-based compact transformer","volume":"147","author":"Chen","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.engappai.2026.114849_b8","article-title":"Towards bias-aware visual question answering: Rectifying and mitigating comprehension biases","author":"Chen","year":"2024","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.engappai.2026.114849_b9","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2023.110706","article-title":"CLVIN: Complete language-vision interaction network for visual question answering","volume":"275","author":"Chen","year":"2023","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.engappai.2026.114849_b10","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.122955","article-title":"Dual-adaptive interactive transformer with textual and visual context for image captioning","volume":"243","author":"Chen","year":"2024","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.engappai.2026.114849_b11","first-page":"1036","article-title":"Ref-nms: Breaking proposal bottlenecks in two-stage referring expression grounding","volume":"vol. 35","author":"Chen","year":"2021"},{"key":"10.1016\/j.engappai.2026.114849_b12","doi-asserted-by":"crossref","unstructured":"Deng, J., Yang, Z., Chen, T., Zhou, W., Li, H., 2021. Transvg: End-to-end visual grounding with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 1769\u20131779.","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"10.1016\/j.engappai.2026.114849_b13","first-page":"1218","article-title":"Similarity reasoning and filtration for image-text matching","volume":"vol. 35","author":"Diao","year":"2021"},{"key":"10.1016\/j.engappai.2026.114849_b14","doi-asserted-by":"crossref","first-page":"4217","DOI":"10.1109\/TMM.2023.3321404","article-title":"SGIR: Star graph-based interaction for efficient and robust multimodal representation","volume":"26","author":"Ding","year":"2024","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.engappai.2026.114849_b15","doi-asserted-by":"crossref","unstructured":"Dong, J., Koniusz, P., Feng, L., Zhang, Y., Zhu, H., Liu, W., Qu, X., Ong, Y.-S., 2025a. Robustifying zero-shot vision language models by subspaces alignment. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 21037\u201321047.","DOI":"10.1109\/ICCV51701.2025.01955"},{"key":"10.1016\/j.engappai.2026.114849_b16","doi-asserted-by":"crossref","unstructured":"Dong, J., Koniusz, P., Qu, X., Ong, Y.-S., 2025b. Stabilizing modality gap & lowering gradient norms improve zero-shot adversarial robustness of vlms. In: Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V. 1. pp. 236\u2013247.","DOI":"10.1145\/3690624.3709296"},{"key":"10.1016\/j.engappai.2026.114849_b17","series-title":"Forty-Second International Conference on Machine Learning","article-title":"Improving zero-shot adversarial robustness in vision-language models by closed-form alignment of adversarial path simplices","author":"Dong","year":"2025"},{"key":"10.1016\/j.engappai.2026.114849_b18","doi-asserted-by":"crossref","unstructured":"Dong, J., Liu, J., Qu, X., Ong, Y.-S., 2025d. Confound from All Sides, Distill with Resilience: Multi-Objective Adversarial Paths to Zero-Shot Robustness. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 624\u2013634.","DOI":"10.1109\/ICCV51701.2025.00066"},{"key":"10.1016\/j.engappai.2026.114849_b19","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"10.1016\/j.engappai.2026.114849_b20","series-title":"2022 IEEE International Conference on Multimedia and Expo","first-page":"1","article-title":"Visual grounding with transformers","author":"Du","year":"2022"},{"key":"10.1016\/j.engappai.2026.114849_b21","doi-asserted-by":"crossref","unstructured":"Fu, Z., Mao, Z., Song, Y., Zhang, Y., 2023. Learning semantic relationship among instances for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 15159\u201315168.","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"10.1016\/j.engappai.2026.114849_b22","series-title":"2015 IEEE International Conference on Computer Vision","first-page":"1440","article-title":"Fast R-CNN","author":"Girshick","year":"2015"},{"issue":"1","key":"10.1016\/j.engappai.2026.114849_b23","doi-asserted-by":"crossref","first-page":"586","DOI":"10.1007\/s10489-022-03559-4","article-title":"Sparse co-attention visual question answering networks based on thresholds","volume":"53","author":"Guo","year":"2023","journal-title":"Appl. Intell."},{"key":"10.1016\/j.engappai.2026.114849_b24","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.125658","article-title":"LRCN: Layer-residual co-attention networks for visual question answering","volume":"263","author":"Han","year":"2025","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.engappai.2026.114849_b25","doi-asserted-by":"crossref","DOI":"10.1109\/TIP.2023.3318949","article-title":"Semantic-aware modular capsule routing for visual question answering","author":"Han","year":"2023","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.engappai.2026.114849_b26","doi-asserted-by":"crossref","first-page":"5537","DOI":"10.1109\/TIP.2023.3318949","article-title":"Semantic-aware modular capsule routing for visual question answering","volume":"32","author":"Han","year":"2023","journal-title":"IEEE Trans. Image Process."},{"issue":"2","key":"10.1016\/j.engappai.2026.114849_b27","doi-asserted-by":"crossref","first-page":"684","DOI":"10.1109\/TPAMI.2019.2911066","article-title":"Learning to compose and reason with language tree structures for visual grounding","volume":"44","author":"Hong","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.engappai.2026.114849_b28","doi-asserted-by":"crossref","unstructured":"Huang, B., Lian, D., Luo, W., Gao, S., 2021. Look before you leap: Learning landmark features for one-stage visual grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 16888\u201316897.","DOI":"10.1109\/CVPR46437.2021.01661"},{"key":"10.1016\/j.engappai.2026.114849_b29","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D., 2019. Gqa: A new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 6700\u20136709.","DOI":"10.1109\/CVPR.2019.00686"},{"key":"10.1016\/j.engappai.2026.114849_b30","doi-asserted-by":"crossref","unstructured":"Jiang, D., Ye, M., 2023. Cross-modal implicit relation reasoning and aligning for text-to-image person retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2787\u20132797.","DOI":"10.1109\/CVPR52729.2023.00273"},{"key":"10.1016\/j.engappai.2026.114849_b31","series-title":"International Conference on Machine Learning","first-page":"5583","article-title":"Vilt: Vision-and-language transformer without convolution or region supervision","author":"Kim","year":"2021"},{"key":"10.1016\/j.engappai.2026.114849_b32","doi-asserted-by":"crossref","unstructured":"Lee, K.-H., Chen, X., Hua, G., Hu, H., He, X., 2018. Stacked cross attention for image-text matching. In: Proceedings of the European Conference on Computer Vision. ECCV, pp. 201\u2013216.","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"10.1016\/j.engappai.2026.114849_b33","doi-asserted-by":"crossref","unstructured":"Li, L., Gan, Z., Cheng, Y., Liu, J., 2019. Relation-aware graph attention network for visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 10313\u201310322.","DOI":"10.1109\/ICCV.2019.01041"},{"key":"10.1016\/j.engappai.2026.114849_b34","first-page":"19652","article-title":"Referring transformer: A one-step approach to multi-task visual grounding","volume":"34","author":"Li","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114849_b35","doi-asserted-by":"crossref","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., Fu, Y., 2019. Visual semantic reasoning for image-text matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 4654\u20134662.","DOI":"10.1109\/ICCV.2019.00475"},{"key":"10.1016\/j.engappai.2026.114849_b36","article-title":"Improving image-text matching with bidirectional consistency of cross-modal alignment","author":"Li","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"8","key":"10.1016\/j.engappai.2026.114849_b37","doi-asserted-by":"crossref","first-page":"10055","DOI":"10.1109\/TPAMI.2023.3262578","article-title":"Local-global context aware transformer for language-guided video segmentation","volume":"45","author":"Liang","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.engappai.2026.114849_b38","doi-asserted-by":"crossref","unstructured":"Liao, Y., Liu, S., Li, G., Wang, F., Chen, Y., Qian, C., Li, B., 2020. A real-time cross-modality correlation filtering method for referring expression comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10880\u201310889.","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"10.1016\/j.engappai.2026.114849_b39","doi-asserted-by":"crossref","unstructured":"Lin, H., Meng, F., Su, J., Yin, Y., Yang, Z., Ge, Y., Zhou, J., Luo, J., 2020. Dynamic context-guided capsule network for multimodal machine translation. In: Proceedings of the 28th ACM International Conference on Multimedia. pp. 1320\u20131329.","DOI":"10.1145\/3394171.3413715"},{"key":"10.1016\/j.engappai.2026.114849_b40","series-title":"ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"8125","article-title":"Prompting large language models with fine-grained visual relations from scene graph for visual question answering","author":"Liu","year":"2024"},{"issue":"10","key":"10.1016\/j.engappai.2026.114849_b41","doi-asserted-by":"crossref","first-page":"11624","DOI":"10.1109\/TPAMI.2023.3284038","article-title":"Cross-modal causal relational reasoning for event-level visual question answering","volume":"45","author":"Liu","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.engappai.2026.114849_b42","article-title":"M2IST: Multi-modal interactive side-tuning for efficient referring expression comprehension","author":"Liu","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114849_b43","doi-asserted-by":"crossref","unstructured":"Liu, C., Mao, Z., Zhang, T., Xie, H., Wang, B., Zhang, Y., 2020. Graph structured network for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10921\u201310930.","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"10.1016\/j.engappai.2026.114849_b44","doi-asserted-by":"crossref","unstructured":"Liu, X., Wang, Z., Shao, J., Wang, X., Li, H., 2019. Improving referring expression grounding with cross-modal attention-guided erasing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1950\u20131959.","DOI":"10.1109\/CVPR.2019.00205"},{"key":"10.1016\/j.engappai.2026.114849_b45","doi-asserted-by":"crossref","unstructured":"Liu, D., Zhang, H., Wu, F., Zha, Z.-J., 2019. Learning to assemble neural module tree networks for visual grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 4673\u20134682.","DOI":"10.1109\/ICCV.2019.00477"},{"key":"10.1016\/j.engappai.2026.114849_b46","doi-asserted-by":"crossref","DOI":"10.7717\/peerj-cs.1400","article-title":"The multi-modal fusion in visual question answering: a review of attention mechanisms","volume":"9","author":"Lu","year":"2023","journal-title":"PeerJ Comput. Sci."},{"key":"10.1016\/j.engappai.2026.114849_b47","article-title":"Hierarchical question-image co-attention for visual question answering","volume":"29","author":"Lu","year":"2016","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"3","key":"10.1016\/j.engappai.2026.114849_b48","doi-asserted-by":"crossref","first-page":"1380","DOI":"10.1109\/TNNLS.2021.3105284","article-title":"Multitask learning for visual question answering","volume":"34","author":"Ma","year":"2021","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"key":"10.1016\/j.engappai.2026.114849_b49","doi-asserted-by":"crossref","unstructured":"Mascharka, D., Tran, P., Soklaski, R., Majumdar, A., 2018. Transparency by design: Closing the gap between performance and interpretability in visual reasoning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 4942\u20134950.","DOI":"10.1109\/CVPR.2018.00519"},{"key":"10.1016\/j.engappai.2026.114849_b50","doi-asserted-by":"crossref","first-page":"1341","DOI":"10.1109\/TLT.2024.3383773","article-title":"ChatGPT for educational purposes: Investigating the impact of knowledge management factors on student satisfaction and continuous usage","volume":"17","author":"Ngo","year":"2024","journal-title":"IEEE Trans. Learn. Technol."},{"key":"10.1016\/j.engappai.2026.114849_b51","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110900","article-title":"GADNet: Improving image\u2013text matching via graph-based aggregation and disentanglement","volume":"157","author":"Pu","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.engappai.2026.114849_b52","doi-asserted-by":"crossref","unstructured":"Qiu, H., Wang, L., Zhao, T., Meng, F., Li, H., 2024. HumanFormer: Human-centric Prompting Multi-modal Perception Transformer for Referring Crowd Detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 5530\u20135540.","DOI":"10.1109\/CVPRW63382.2024.00562"},{"key":"10.1016\/j.engappai.2026.114849_b53","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2025.107871","article-title":"A triple-branch hybrid dynamic-static alignment strategy for vision-language tasks","author":"Shen","year":"2025","journal-title":"Neural Netw."},{"issue":"6","key":"10.1016\/j.engappai.2026.114849_b54","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s10462-025-11163-4","article-title":"GFSNet: Gaussian Fourier with sparse attention network for visual question answering","volume":"58","author":"Shen","year":"2025","journal-title":"Artif. Intell. Rev."},{"issue":"13","key":"10.1016\/j.engappai.2026.114849_b55","doi-asserted-by":"crossref","first-page":"16706","DOI":"10.1007\/s10489-022-04355-w","article-title":"Local self-attention in transformer for visual question answering","volume":"53","author":"Shen","year":"2023","journal-title":"Appl. Intell."},{"issue":"6","key":"10.1016\/j.engappai.2026.114849_b56","doi-asserted-by":"crossref","first-page":"5062","DOI":"10.1007\/s10489-024-05437-7","article-title":"Relational reasoning and adaptive fusion for visual question answering","volume":"54","author":"Shen","year":"2024","journal-title":"Appl. Intell."},{"issue":"2","key":"10.1016\/j.engappai.2026.114849_b57","doi-asserted-by":"crossref","first-page":"1181","DOI":"10.1109\/TPAMI.2023.3328185","article-title":"Dynamic MDETR: A dynamic multimodal transformer decoder for visual grounding","volume":"46","author":"Shi","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"6","key":"10.1016\/j.engappai.2026.114849_b58","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s00530-024-01568-6","article-title":"KTMN: Knowledge-driven two-stage modulation network for visual question answering","volume":"30","author":"Shi","year":"2024","journal-title":"Multimedia Syst."},{"key":"10.1016\/j.engappai.2026.114849_b59","first-page":"1","article-title":"SAFFNet: self-attention based on Fourier frequency domain filter network for visual question answering","author":"Shi","year":"2025","journal-title":"Vis. Comput."},{"key":"10.1016\/j.engappai.2026.114849_b60","doi-asserted-by":"crossref","unstructured":"Shi, J., Zhang, H., Li, J., 2019. Explainable and explicit visual reasoning over scene graphs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 8376\u20138384.","DOI":"10.1109\/CVPR.2019.00857"},{"key":"10.1016\/j.engappai.2026.114849_b61","article-title":"Vman: visual-modified attention network for multimodal paradigms","author":"Song","year":"2024","journal-title":"Vis. Comput."},{"issue":"5","key":"10.1016\/j.engappai.2026.114849_b62","doi-asserted-by":"crossref","first-page":"3213","DOI":"10.1109\/TPAMI.2023.3339628","article-title":"Context disentangling and prototype inheriting for robust visual grounding","volume":"46","author":"Tang","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.engappai.2026.114849_b63","series-title":"Gemini: a family of highly capable multimodal models","author":"Team","year":"2023"},{"key":"10.1016\/j.engappai.2026.114849_b64","unstructured":"Tian, W., Li, H., Zhao, Z.-Q., 2022. Dual capsule attention mask network with mutual learning for visual question answering. In: Proceedings of the 29th International Conference on Computational Linguistics. pp. 5678\u20135688."},{"issue":"6","key":"10.1016\/j.engappai.2026.114849_b65","doi-asserted-by":"crossref","DOI":"10.1007\/s11432-024-4333-7","article-title":"DiagLLM: multimodal reasoning with large language model for explainable bearing fault diagnosis","volume":"68","author":"Wang","year":"2025","journal-title":"Sci. China Inf. Sci."},{"key":"10.1016\/j.engappai.2026.114849_b66","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2023.102132","article-title":"Cross-modal incongruity aligning and collaborating for multi-modal sarcasm detection","volume":"103","author":"Wang","year":"2024","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.engappai.2026.114849_b67","doi-asserted-by":"crossref","first-page":"111","DOI":"10.1109\/TASLP.2022.3221017","article-title":"M3S: Scene graph driven multi-granularity multi-task learning for multi-modal NER","volume":"31","author":"Wang","year":"2022","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.engappai.2026.114849_b68","article-title":"Matryoshka learning with metric transfer for image-text matching","author":"Wang","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114849_b69","series-title":"2024 IEEE 4th International Conference on Electronic Technology, Communication and Information","first-page":"1185","article-title":"Advanced multimodal deep learning architecture for image-text matching","author":"Wang","year":"2024"},{"key":"10.1016\/j.engappai.2026.114849_b70","first-page":"44","article-title":"Gita: Graph to visual and textual integration for vision-language graph reasoning","volume":"37","author":"Wei","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"10","key":"10.1016\/j.engappai.2026.114849_b71","doi-asserted-by":"crossref","first-page":"10399","DOI":"10.1109\/TCSVT.2024.3407785","article-title":"Visual grounding with dual knowledge distillation","volume":"34","author":"Wu","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114849_b72","series-title":"2023 IEEE International Conference on Big Data (BigData)","first-page":"2247","article-title":"Multimodal large language models: A survey","author":"Wu","year":"2023"},{"issue":"9","key":"10.1016\/j.engappai.2026.114849_b73","doi-asserted-by":"crossref","first-page":"233","DOI":"10.1007\/s10462-024-10835-x","article-title":"An effective multi-modal adaptive contextual feature information fusion method for Chinese long text classification","volume":"57","author":"Xu","year":"2024","journal-title":"Artif. Intell. Rev."},{"issue":"6","key":"10.1016\/j.engappai.2026.114849_b74","doi-asserted-by":"crossref","DOI":"10.1371\/journal.pone.0287557","article-title":"Multi-modal adaptive gated mechanism for visual question answering","volume":"18","author":"Xu","year":"2023","journal-title":"PLoS One"},{"issue":"1","key":"10.1016\/j.engappai.2026.114849_b75","doi-asserted-by":"crossref","first-page":"2","DOI":"10.1007\/s13735-025-00394-4","article-title":"CSAM: Capsule spatial attention mask network for visual question answering","volume":"15","author":"Xue","year":"2026","journal-title":"Int. J. Multimed. Inf. Retr."},{"key":"10.1016\/j.engappai.2026.114849_b76","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XIV 16","first-page":"387","article-title":"Improving one-stage visual grounding by recursive sub-query construction","author":"Yang","year":"2020"},{"key":"10.1016\/j.engappai.2026.114849_b77","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., Smola, A., 2016. Stacked attention networks for image question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 21\u201329.","DOI":"10.1109\/CVPR.2016.10"},{"key":"10.1016\/j.engappai.2026.114849_b78","doi-asserted-by":"crossref","unstructured":"Yu, Z., Cui, Y., Yu, J., Wang, M., Tao, D., Tian, Q., 2020. Deep multimodal neural architecture search. In: Proceedings of the 28th ACM International Conference on Multimedia. pp. 3743\u20133752.","DOI":"10.1145\/3394171.3413977"},{"key":"10.1016\/j.engappai.2026.114849_b79","doi-asserted-by":"crossref","first-page":"292","DOI":"10.1016\/j.future.2023.01.004","article-title":"Multi-scale image\u2013text matching network for scene and spatio-temporal images","volume":"142","author":"Yu","year":"2023","journal-title":"Future Gener. Comput. Syst."},{"key":"10.1016\/j.engappai.2026.114849_b80","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Cui, Y., Tao, D., Tian, Q., 2019. Deep modular co-attention networks for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 6281\u20136290.","DOI":"10.1109\/CVPR.2019.00644"},{"key":"10.1016\/j.engappai.2026.114849_b81","first-page":"3262","article-title":"Show your faith: Cross-modal confidence-aware network for image-text matching","volume":"vol. 36","author":"Zhang","year":"2022"},{"key":"10.1016\/j.engappai.2026.114849_b82","series-title":"2023 9th International Conference on Mechatronics and Robotics Engineering","first-page":"84","article-title":"Similarity contrastive capsule transformation for image-text matching","author":"Zhang","year":"2023"},{"key":"10.1016\/j.engappai.2026.114849_b83","doi-asserted-by":"crossref","first-page":"70","DOI":"10.1016\/j.inffus.2021.02.006","article-title":"DMRFNet: deep multimodal reasoning and fusion for visual question answering and explanation generation","volume":"72","author":"Zhang","year":"2021","journal-title":"Inf. Fusion"},{"issue":"6","key":"10.1016\/j.engappai.2026.114849_b84","doi-asserted-by":"crossref","first-page":"4829","DOI":"10.1109\/TCSVT.2023.3336371","article-title":"SPT: Spatial pyramid transformer for image captioning","volume":"34","author":"Zhang","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"1","key":"10.1016\/j.engappai.2026.114849_b85","doi-asserted-by":"crossref","first-page":"134","DOI":"10.1109\/TNNLS.2021.3090426","article-title":"A real-time global inference network for one-stage referring expression comprehension","volume":"34","author":"Zhou","year":"2021","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"key":"10.1016\/j.engappai.2026.114849_b86","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Ren, T., Zhu, C., Sun, X., Liu, J., Ding, X., Xu, M., Ji, R., 2021b. Trar: Routing the attention spans in transformer for visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 2074\u20132084.","DOI":"10.1109\/ICCV48922.2021.00208"},{"issue":"2","key":"10.1016\/j.engappai.2026.114849_b87","doi-asserted-by":"crossref","first-page":"715","DOI":"10.1109\/TKDE.2022.3224228","article-title":"Multi-modal knowledge graph construction and application: A survey","volume":"36","author":"Zhu","year":"2024","journal-title":"IEEE Trans. Knowl. Data Eng."}],"container-title":["Engineering Applications of Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0952197626011310?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0952197626011310?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T15:11:40Z","timestamp":1779808300000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0952197626011310"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,8]]},"references-count":87,"alternative-id":["S0952197626011310"],"URL":"https:\/\/doi.org\/10.1016\/j.engappai.2026.114849","relation":{},"ISSN":["0952-1976"],"issn-type":[{"value":"0952-1976","type":"print"}],"subject":[],"published":{"date-parts":[[2026,8]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Multi-Granularity Modal Interaction and Fusion framework for vision-language tasks","name":"articletitle","label":"Article Title"},{"value":"Engineering Applications of Artificial Intelligence","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.engappai.2026.114849","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"114849"}}