{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,21]],"date-time":"2025-05-21T04:21:39Z","timestamp":1747801299267,"version":"3.41.0"},"reference-count":57,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,3,13]],"date-time":"2025-03-13T00:00:00Z","timestamp":1741824000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,3,13]],"date-time":"2025-03-13T00:00:00Z","timestamp":1741824000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21A20390","U21A20390","U21A20390","U21A20390"],"award-info":[{"award-number":["U21A20390","U21A20390","U21A20390","U21A20390"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Development Project of Jilin Province of China","award":["20240601039RC","20240601039RC","20240601039RC","20240601039RC"],"award-info":[{"award-number":["20240601039RC","20240601039RC","20240601039RC","20240601039RC"]}]},{"name":"the Fundamental Research Funds for the Central University, JLU"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Knowl Inf Syst"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s10115-025-02384-8","type":"journal-article","created":{"date-parts":[[2025,3,13]],"date-time":"2025-03-13T02:39:57Z","timestamp":1741833597000},"page":"4949-4966","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Prompting visual dialog with implicit logical knowledge"],"prefix":"10.1007","volume":"67","author":[{"given":"Zefan","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Yanhui","family":"Li","sequence":"additional","affiliation":[]},{"given":"Weiqi","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Tian","family":"Bai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,13]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"Das A, Kottur S, Gupta K, Singh A, Yadav D, Moura JM, Parikh D, Batra D (2017) Visual dialog. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 326\u2013335","key":"2384_CR1","DOI":"10.1109\/CVPR.2017.121"},{"doi-asserted-by":"crossref","unstructured":"Zhang W, et\u00a0al (2024) Caption-aware multimodal relation extraction with mutual information maximization. In: ACM Multimedia 2024","key":"2384_CR2","DOI":"10.1145\/3664647.3681219"},{"issue":"3","key":"2384_CR3","doi-asserted-by":"publisher","first-page":"2193","DOI":"10.1007\/s10115-023-02028-9","volume":"66","author":"Z Su","year":"2024","unstructured":"Su Z, Gou G (2024) Knowledge enhancement and scene understanding for knowledge-based visual question answering. Knowl Inf Syst 66(3):2193\u20132208","journal-title":"Knowl Inf Syst"},{"issue":"2","key":"2384_CR4","doi-asserted-by":"publisher","first-page":"921","DOI":"10.1007\/s10115-022-01775-5","volume":"65","author":"S Li","year":"2023","unstructured":"Li S, Luo C, Zhu Y, Wu W (2023) Bold driver and static restart fused adaptive momentum for visual question answering. Knowl Inf Syst 65(2):921\u2013943","journal-title":"Knowl Inf Syst"},{"doi-asserted-by":"publisher","unstructured":"Zhao L, Gao L, Guo Y, Song J, Shen H (2021) Skanet: Structured knowledge-aware network for visual dialog. In: 2021 IEEE international conference on multimedia and expo (ICME), pp. 1\u20136. https:\/\/doi.org\/10.1109\/ICME51207.2021.9428279","key":"2384_CR5","DOI":"10.1109\/ICME51207.2021.9428279"},{"doi-asserted-by":"publisher","unstructured":"Zhang S, Jiang X, Yang Z, Wan T, Qin Z (2022) Reasoning with multi-structure commonsense knowledge in visual dialog. In: 2022 IEEE\/CVF conference on computer vision and pattern recognition workshops (CVPRW), pp. 4599\u20134608. https:\/\/doi.org\/10.1109\/CVPRW56347.2022.00506","key":"2384_CR6","DOI":"10.1109\/CVPRW56347.2022.00506"},{"doi-asserted-by":"publisher","unstructured":"Zhang Z, Ji Y, Liu C (2023) Knowledge-aware causal inference network for visual dialog. In: Proceedings of the 2023 ACM international conference on multimedia retrieval. ICMR \u201923, pp. 253\u2013261. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3591106.3592272","key":"2384_CR7","DOI":"10.1145\/3591106.3592272"},{"doi-asserted-by":"publisher","unstructured":"Shao Z, Yu Z, Wang M, Yu J (2023) Prompting large language models with answer heuristics for knowledge-based visual question answering. In: 2023 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 14974\u201314983 . https:\/\/doi.org\/10.1109\/CVPR52729.2023.01438","key":"2384_CR8","DOI":"10.1109\/CVPR52729.2023.01438"},{"unstructured":"Zhao WX, Zhou K, Li J, Tang T, Wang X, Hou Y, Min Y, Zhang B, Zhang J, Dong Z, Du Y, Yang C, Chen Y, Chen Z, Jiang J, Ren R, Li Y, Tang X, Liu Z, Liu P, Nie J-Y, Wen J-R (2023) A survey of large language models","key":"2384_CR9"},{"doi-asserted-by":"publisher","unstructured":"Zhang X, Zhang F, Xu C (2023) Vqacl: A novel visual question answering continual learning setting. In: 2023 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 19102\u201319112. https:\/\/doi.org\/10.1109\/CVPR52729.2023.01831","key":"2384_CR10","DOI":"10.1109\/CVPR52729.2023.01831"},{"doi-asserted-by":"publisher","unstructured":"Cho JW, Kim D-J, Ryu H, Kweon IS (2023) Generative bias for robust visual question answering. In: 2023 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 11681\u201311690 . https:\/\/doi.org\/10.1109\/CVPR52729.2023.01124","key":"2384_CR11","DOI":"10.1109\/CVPR52729.2023.01124"},{"doi-asserted-by":"publisher","unstructured":"Basu A, Addepalli S, Babu RV (2023) Rmlvqa: A margin loss approach for visual question answering with language biases. In: 2023 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 11671\u201311680. https:\/\/doi.org\/10.1109\/CVPR52729.2023.01123","key":"2384_CR12","DOI":"10.1109\/CVPR52729.2023.01123"},{"doi-asserted-by":"publisher","unstructured":"Jiang J, Zheng N (2023) Mixphm: Redundancy-aware parameter-efficient tuning for low-resource visual question answering. In: 2023 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 24203\u201324213 . https:\/\/doi.org\/10.1109\/CVPR52729.2023.02318","key":"2384_CR13","DOI":"10.1109\/CVPR52729.2023.02318"},{"doi-asserted-by":"publisher","unstructured":"Guo J, Li J, Li D, Huat\u00a0Tiong AM, Li B, Tao D, Hoi S (2023) From images to textual prompts: Zero-shot visual question answering with frozen large language models. In: 2023 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 10867\u201310877 . https:\/\/doi.org\/10.1109\/CVPR52729.2023.01046","key":"2384_CR14","DOI":"10.1109\/CVPR52729.2023.01046"},{"unstructured":"Bai J, Liu X, Wang W, Luo C, Song Y (2023) Complex query answering on eventuality knowledge graph with implicit logical constraints","key":"2384_CR15"},{"doi-asserted-by":"crossref","unstructured":"Li XL, Holtzman A, Fried D, Liang P, Eisner J, Hashimoto T, Zettlemoyer L, Lewis M (2023) Contrastive decoding: open-ended text generation as optimization. In: Rogers, A., Boyd-Graber, J., Okazaki, N. (eds.) Proceedings of the 61st annual meeting of the association for computational linguistics (Volume 1: Long Papers), pp. 12286\u201312312. Association for Computational Linguistics, Toronto, Canada","key":"2384_CR16","DOI":"10.18653\/v1\/2023.acl-long.687"},{"doi-asserted-by":"crossref","unstructured":"Qu F, Sun H, Wu Y (2024) Unsupervised distractor generation via large language model distilling and counterfactual contrastive decoding. In: Ku L-W, Martins A, Srikumar V (eds) Findings of the association for computational linguistics ACL 2024. Association for Computational Linguistics, Bangkok, Thailand and virtual meeting, pp 827\u2013838","key":"2384_CR17","DOI":"10.18653\/v1\/2024.findings-acl.47"},{"doi-asserted-by":"crossref","unstructured":"Wang X, Pan J, Ding L, Biemann C (2024) Mitigating hallucinations in large vision-language models with instruction contrastive decoding. In: Ku, L.-W., Martins, A., Srikumar, V. (eds.) Findings of the association for computational linguistics ACL 2024, pp. 15840\u201315853. Association for computational linguistics, Bangkok, Thailand and virtual meeting. https:\/\/aclanthology.org\/2024.findings-acl.937","key":"2384_CR18","DOI":"10.18653\/v1\/2024.findings-acl.937"},{"doi-asserted-by":"crossref","unstructured":"Agarwal S, Bui T, Lee JY, Konstas I, Rieser V (2020) History for visual dialog: Do we really need it? In: Proceedings of the 58th annual meeting of the association for computational linguistics","key":"2384_CR19","DOI":"10.18653\/v1\/2020.acl-main.728"},{"unstructured":"Niu Y, Zhang H, Zhang M, Zhang J, Lu Z, Wen J-R Recursive visual attention in visual dialog. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 6679\u20136688","key":"2384_CR20"},{"issue":"2","key":"2384_CR21","doi-asserted-by":"publisher","first-page":"861","DOI":"10.1109\/TCSVT.2022.3207228","volume":"33","author":"L Zhao","year":"2023","unstructured":"Zhao L, Li J, Gao L, Rao Y, Song J, Shen HT (2023) Heterogeneous knowledge network for visual dialog. IEEE Trans Circuits Syst Video Technol 33(2):861\u2013871. https:\/\/doi.org\/10.1109\/TCSVT.2022.3207228","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"issue":"10","key":"2384_CR22","doi-asserted-by":"publisher","first-page":"6056","DOI":"10.1109\/TPAMI.2021.3085755","volume":"44","author":"D Guo","year":"2022","unstructured":"Guo D, Wang H, Wang M (2022) Context-aware graph inference with knowledge distillation for visual dialog. IEEE Trans Pattern Anal Mach Intell 44(10):6056\u20136073. https:\/\/doi.org\/10.1109\/TPAMI.2021.3085755","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"doi-asserted-by":"crossref","unstructured":"Murahari V, Batra D, Parikh D, Das A (2020) Large-scale pretraining for visual dialog: a simple state-of-the-art baseline. In: Vedaldi A, Bischof H, Brox T, Frahm J-M (eds) Computer Vision - ECCV 2020. Springer, Cham, pp 336\u2013352","key":"2384_CR23","DOI":"10.1007\/978-3-030-58523-5_20"},{"doi-asserted-by":"publisher","unstructured":"Wang Y, Joty S, Lyu M, King I, Xiong C, Hoi SCH (2020) VD-BERT: A Unified Vision and Dialog Transformer with BERT. In: Proceedings of the 2020 conference on empirical methods in natural language processing (EMNLP), pp. 3325\u20133338. Association for Computational Linguistics, Online . https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.269 . https:\/\/aclanthology.org\/2020.emnlp-main.269","key":"2384_CR24","DOI":"10.18653\/v1\/2020.emnlp-main.269"},{"doi-asserted-by":"publisher","unstructured":"Chen F, Zhang D, Chen X, Shi J, Xu S, XU B (2022) Unsupervised and pseudo-supervised vision-language alignment in visual dialog. In: Proceedings of the 30th ACM international conference on multimedia. MM \u201922, pp. 4142\u20134153. Association for Computing Machinery, New York, NY, USA . https:\/\/doi.org\/10.1145\/3503161.3547776","key":"2384_CR25","DOI":"10.1145\/3503161.3547776"},{"doi-asserted-by":"publisher","unstructured":"Wang Z, Wang J, Jiang C (2022) Unified multimodal model with unlikelihood training for visual dialog. In: Proceedings of the 30th ACM international conference on multimedia. MM \u201922, pp. 4625\u20134634. Association for Computing Machinery, New York, NY, USA . https:\/\/doi.org\/10.1145\/3503161.3547974","key":"2384_CR26","DOI":"10.1145\/3503161.3547974"},{"doi-asserted-by":"publisher","unstructured":"Zhuang Y, Yu T, Wu J, Wu S, Li S (2022) Spatial-temporal aligned multi-agent learning for visual dialog systems. In: Proceedings of the 30th ACM international conference on multimedia. MM \u201922, pp. 482\u2013490. Association for Computing Machinery, New York, NY, USA . https:\/\/doi.org\/10.1145\/3503161.3548345","key":"2384_CR27","DOI":"10.1145\/3503161.3548345"},{"doi-asserted-by":"publisher","unstructured":"Chen C, Tan Z, Cheng Q, Jiang X, Liu Q, Zhu Y, Gu X (2022) Utc: A unified transformer with inter-task contrastive learning for visual dialog. In: 2022 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 18082\u201318091 . https:\/\/doi.org\/10.1109\/CVPR52688.2022.01757","key":"2384_CR28","DOI":"10.1109\/CVPR52688.2022.01757"},{"issue":"2","key":"2384_CR29","doi-asserted-by":"publisher","first-page":"167","DOI":"10.3233\/SW-140134","volume":"6","author":"J Lehmann","year":"2015","unstructured":"Lehmann J, Isele R, Jakob M, Jentzsch A, Kontokostas D, Mendes PN, Hellmann S, Morsey M, Kleef P, Auer S, Bizer C (2015) Dbpedia - a large-scale, multilingual knowledge base extracted from wikipedia. Semantic Web 6(2):167\u2013195. https:\/\/doi.org\/10.3233\/SW-140134","journal-title":"Semantic Web"},{"doi-asserted-by":"crossref","unstructured":"Speer R, Chin J, Havasi C (2017) Conceptnet 5.5: An open multilingual graph of general knowledge. In: Thirty-first AAAI conference on artificial intelligence","key":"2384_CR30","DOI":"10.1609\/aaai.v31i1.11164"},{"key":"2384_CR31","doi-asserted-by":"publisher","first-page":"1639","DOI":"10.1109\/TMM.2023.3284594","volume":"26","author":"A-A Liu","year":"2024","unstructured":"Liu A-A, Huang C, Xu N, Tian H, Liu J, Zhang Y (2024) Counterfactual visual dialog: Robust commonsense knowledge learning from unbiased training. IEEE Trans Multimedia 26:1639\u20131651. https:\/\/doi.org\/10.1109\/TMM.2023.3284594","journal-title":"IEEE Trans Multimedia"},{"doi-asserted-by":"publisher","unstructured":"Yuan H, Lu K, Huang F, Yuan Z, Zhou C (2024) Speculative contrastive decoding. In: Ku, L.-W., Martins, A., Srikumar, V. (eds.) Proceedings of the 62nd annual meeting of the association for computational linguistics (Volume 2: Short Papers), pp. 56\u201364. Association for Computational Linguistics, Bangkok, Thailand. https:\/\/doi.org\/10.18653\/v1\/2024.acl-short.5 . https:\/\/aclanthology.org\/2024.acl-short.5","key":"2384_CR32","DOI":"10.18653\/v1\/2024.acl-short.5"},{"doi-asserted-by":"publisher","unstructured":"Leng S, Zhang H, Chen G, Li X, Lu S, Miao C, Bing L(2024) Mitigating object hallucinations in large vision-language models through visual contrastive decoding. In: 2024 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 13872\u201313882 . https:\/\/doi.org\/10.1109\/CVPR52733.2024.01316","key":"2384_CR33","DOI":"10.1109\/CVPR52733.2024.01316"},{"doi-asserted-by":"publisher","unstructured":"Jin J, Wang H, Zhang H, Li X, Guo Z (2024) DVD: Dynamic contrastive decoding for knowledge amplification in multi-document question answering. In: Al-Onaizan, Y., Bansal, M., Chen, Y.-N. (eds.) Proceedings of the 2024 conference on empirical methods in natural language processing, pp. 4624\u20134637. Association for Computational Linguistics, Miami, Florida, USA. https:\/\/doi.org\/10.18653\/v1\/2024.emnlp-main.266 . https:\/\/aclanthology.org\/2024.emnlp-main.266","key":"2384_CR34","DOI":"10.18653\/v1\/2024.emnlp-main.266"},{"unstructured":"Touvron H, Lavril T, Izacard G, Martinet X, Lachaux M-A, Lacroix T, Rozi\u00e8re B, Goyal N, Hambro E, Azhar F, Rodriguez A, Joulin A, Grave E, Lample G (2023) LLaMA: Open and Efficient Foundation Language Models","key":"2384_CR35"},{"unstructured":"Taylor R, Kardas M, Cucurull G, Scialom T, Hartshorn A, Saravia E, Poulton A, Kerkez V, Stojnic R (2022) Galactica: a large language model for science","key":"2384_CR36"},{"unstructured":"Touvron H, Martin L, Stone K, Albert P, Almahairi A, Babaei Y, Bashlykov N, Batra S, Bhargava P, Bhosale S, Bikel D, Blecher L, Ferrer CC, Chen M, Cucurull G, Esiobu D, Fernandes J, Fu J, Fu W, Fuller B, Gao C, Goswami V, Goyal N, Hartshorn A, Hosseini S, Hou R, Inan H, Kardas M, Kerkez V, Khabsa M, Kloumann I, Korenev A, Koura PS, Lachaux M-A, Lavril T, Lee J, Liskovich D, Lu Y, Mao Y, Martinet X, Mihaylov T, Mishra P, Molybog I, Nie Y, Poulton A, Reizenstein J, Rungta R, Saladi K, Schelten A, Silva R, Smith EM, Subramanian R, Tan XE, Tang B, Taylor R, Williams A, Kuan JX, Xu P, Yan Z, Zarov I, Zhang Y, Fan A, Kambadur M, Narang S, Rodriguez ., Stojnic R, Edunov S, Scialom T (2023) Llama 2: Open Foundation and Fine-Tuned Chat Models","key":"2384_CR37"},{"issue":"6","key":"2384_CR38","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2017","unstructured":"Ren S, He K, Girshick R, Sun J (2017) Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Trans Pattern Anal Mach Intell 39(6):1137\u20131149","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning CD (2014) Glove: Global vectors for word representation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pp. 1532\u20131543","key":"2384_CR39","DOI":"10.3115\/v1\/D14-1162"},{"doi-asserted-by":"crossref","unstructured":"Kottur S, Moura JMF, Parikh D, Batra D, Rohrbach M (2018) Visual coreference resolution in visual dialog using neural module networks. In: Ferrari V, Hebert M, Sminchisescu C, Weiss Y (eds) Computer Vision - ECCV 2018. Springer, Cham, pp 160\u2013178","key":"2384_CR40","DOI":"10.1007\/978-3-030-01267-0_10"},{"doi-asserted-by":"crossref","unstructured":"Zheng Z, Wang W, Qi S, Zhu S-C (2019) Reasoning visual dialogs with structural and partial observations. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","key":"2384_CR41","DOI":"10.1109\/CVPR.2019.00683"},{"doi-asserted-by":"publisher","unstructured":"Guo D, Wang H, Wang M (2019) Dual visual attention network for visual dialog. In: Proceedings of the twenty-eighth international joint conference on artificial intelligence, IJCAI-19, pp. 4989\u20134995. International joint conferences on artificial intelligence organization. https:\/\/doi.org\/10.24963\/ijcai.2019\/693","key":"2384_CR42","DOI":"10.24963\/ijcai.2019\/693"},{"doi-asserted-by":"crossref","unstructured":"Yang T, Zha Z-J, Zhang H (2019) Making history matter: History-advantage sequence training for visual dialog. In: Proceedings of the IEEE\/cvf international conference on computer vision, pp. 2561\u20132569","key":"2384_CR43","DOI":"10.1109\/ICCV.2019.00265"},{"doi-asserted-by":"crossref","unstructured":"Guo D, Wang H, Zhang H, Zha Z-J, Wang M (2020) Iterative context-aware graph inference for visual dialog. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","key":"2384_CR44","DOI":"10.1109\/CVPR42600.2020.01007"},{"issue":"05","key":"2384_CR45","doi-asserted-by":"publisher","first-page":"8091","DOI":"10.1609\/aaai.v34i05.6320","volume":"34","author":"H Kim","year":"2020","unstructured":"Kim H, Tan H, Bansal M (2020) Modality-balanced models for visual dialogue. Proc AAAI Conf Artif Intell 34(05):8091\u20138098. https:\/\/doi.org\/10.1609\/aaai.v34i05.6320","journal-title":"Proc AAAI Conf Artif Intell"},{"issue":"07","key":"2384_CR46","doi-asserted-by":"publisher","first-page":"11125","DOI":"10.1609\/aaai.v34i07.6769","volume":"34","author":"X Jiang","year":"2020","unstructured":"Jiang X, Yu J, Qin Z, Zhuang Y, Zhang X, Hu Y, Wu Q (2020) Dualvd: An adaptive dual encoding model for deep visual understanding in visual dialogue. Proc AAAI Conf Artif Intell 34(07):11125\u201311132. https:\/\/doi.org\/10.1609\/aaai.v34i07.6769","journal-title":"Proc AAAI Conf Artif Intell"},{"doi-asserted-by":"crossref","unstructured":"Jiang X, Du S, Qin Z, Sun Y, Yu J (2020) KBGN: knowledge-bridge graph network for adaptive vision-text reasoning in visual dialogue","key":"2384_CR47","DOI":"10.1145\/3394171.3413826"},{"doi-asserted-by":"crossref","unstructured":"Nguyen V-Q, Suganuma M, Okatani T (2020) Efficient attention mechanism for visual dialog that can handle all the interactions between multiple inputs. In: Vedaldi A, Bischof H, Brox T, Frahm J-M (eds) Computer Vision - ECCV 2020. Springer, Cham, pp 223\u2013240","key":"2384_CR48","DOI":"10.1007\/978-3-030-58586-0_14"},{"doi-asserted-by":"publisher","unstructured":"Chen F, Chen X, Meng F, Li P, Zhou J (2021) GoG: Relation-aware graph-over-graph network for visual dialog. In: Findings of the association for computational linguistics: ACL-IJCNLP 2021, pp. 230\u2013243. Association for Computational Linguistics, Online. https:\/\/doi.org\/10.18653\/v1\/2021.findings-acl.20 . https:\/\/aclanthology.org\/2021.findings-acl.20","key":"2384_CR49","DOI":"10.18653\/v1\/2021.findings-acl.20"},{"doi-asserted-by":"crossref","unstructured":"Chen F, Chen X, Xu C, Jiang D (2021) Learning to ground visual objects for visual dialog. In: Findings of the association for computational linguistics: EMNLP 2021, pp. 1081\u20131091. Association for Computational Linguistics, Punta Cana, Dominican Republic. https:\/\/aclanthology.org\/2021.findings-emnlp.93","key":"2384_CR50","DOI":"10.18653\/v1\/2021.findings-emnlp.93"},{"key":"2384_CR51","doi-asserted-by":"publisher","first-page":"399","DOI":"10.1016\/j.neucom.2021.03.104","volume":"449","author":"B Lin","year":"2021","unstructured":"Lin B, Zhu Y, Liang X (2021) Heterogeneous excitation-and-squeeze network for visual dialog. Neurocomputing 449:399\u2013410. https:\/\/doi.org\/10.1016\/j.neucom.2021.03.104","journal-title":"Neurocomputing"},{"doi-asserted-by":"publisher","unstructured":"Chen T, Gao L, Li X, Zhao L, Song J (2022) Context gating with multi-level ranking learning for visual dialog. In: 2022 IEEE international conference on multimedia and expo (ICME), pp. 1\u20136 . https:\/\/doi.org\/10.1109\/ICME52920.2022.9859849","key":"2384_CR52","DOI":"10.1109\/ICME52920.2022.9859849"},{"key":"2384_CR53","doi-asserted-by":"publisher","first-page":"9095","DOI":"10.1109\/TMM.2024.3385997","volume":"26","author":"S Du","year":"2024","unstructured":"Du S, Wang H, Li T, Chen CW (2024) Hybrid graph reasoning with dynamic interaction for visual dialog. IEEE Trans Multimedia 26:9095\u20139108. https:\/\/doi.org\/10.1109\/TMM.2024.3385997","journal-title":"IEEE Trans Multimedia"},{"doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S, Hays J, Zitnick CL (2014) Microsoft coco: common objects in context. European conference on computer vision, 740\u2013755. Springer","key":"2384_CR54","DOI":"10.1007\/978-3-319-10602-1_48"},{"doi-asserted-by":"crossref","unstructured":"Krishna R, Zhu Y, Groth O, Johnson J, Li FF (2017) Visual genome: connecting language and vision using crowdsourced dense image annotations. Int J Comput Vis. 123(1)","key":"2384_CR55","DOI":"10.1007\/s11263-016-0981-7"},{"unstructured":"Kingma DP, Ba J (2014) Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980","key":"2384_CR56"},{"unstructured":"Park S, Whang T, Yoon Y, Lim H (2020) Multi-view attention network for visual dialog . https:\/\/arxiv.org\/abs\/2004.14025","key":"2384_CR57"}],"container-title":["Knowledge and Information Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10115-025-02384-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10115-025-02384-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10115-025-02384-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,20]],"date-time":"2025-05-20T12:38:17Z","timestamp":1747744697000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10115-025-02384-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,13]]},"references-count":57,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["2384"],"URL":"https:\/\/doi.org\/10.1007\/s10115-025-02384-8","relation":{},"ISSN":["0219-1377","0219-3116"],"issn-type":[{"type":"print","value":"0219-1377"},{"type":"electronic","value":"0219-3116"}],"subject":[],"published":{"date-parts":[[2025,3,13]]},"assertion":[{"value":"20 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 December 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 February 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 March 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"No data was used for the research described in the article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval"}}]}}