{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T03:38:04Z","timestamp":1776310684779,"version":"3.50.1"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030585853","type":"print"},{"value":"9783030585860","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58586-0_14","type":"book-chapter","created":{"date-parts":[[2020,11,29]],"date-time":"2020-11-29T17:02:42Z","timestamp":1606669362000},"page":"223-240","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":37,"title":["Efficient Attention Mechanism for Visual Dialog that Can Handle All the Interactions Between Multiple Inputs"],"prefix":"10.1007","author":[{"given":"Van-Quang","family":"Nguyen","sequence":"first","affiliation":[]},{"given":"Masanori","family":"Suganuma","sequence":"additional","affiliation":[]},{"given":"Takayuki","family":"Okatani","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,11,30]]},"reference":[{"key":"14_CR1","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"14_CR2","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"14_CR3","doi-asserted-by":"crossref","unstructured":"Chattopadhyay, P., et al.: Evaluating visual conversational agents via cooperative human-AI games. In: Proceedings of AAAI Conference on Human Computation and Crowdsourcing (2017)","DOI":"10.1609\/hcomp.v5i1.13312"},{"key":"14_CR4","unstructured":"Chen, K., Wang, J., Chen, L.C., Gao, H., Xu, W., Nevatia, R.: ABC-CNN: an attention based convolutional neural network for visual question answering. arXiv preprint arXiv:1511.05960 (2015)"},{"key":"14_CR5","unstructured":"Chen, X., et al.: Microsoft COCO captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"14_CR6","unstructured":"Chen, Y.C., et al.: UNITER: learning universal image-text representations. arXiv preprint arXiv:1909.11740 (2019)"},{"key":"14_CR7","doi-asserted-by":"crossref","unstructured":"Das, A., et al.: Visual dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 326\u2013335 (2017)","DOI":"10.1109\/CVPR.2017.121"},{"key":"14_CR8","doi-asserted-by":"crossref","unstructured":"Das, A., Kottur, S., Moura, J.M., Lee, S., Batra, D.: Learning cooperative visual dialog agents with deep reinforcement learning. 
In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2951\u20132960 (2017)","DOI":"10.1109\/ICCV.2017.321"},{"key":"14_CR9","doi-asserted-by":"crossref","unstructured":"De Vries, H., Strub, F., Chandar, S., Pietquin, O., Larochelle, H., Courville, A.: Guesswhat?! visual object discovery through multi-modal dialogue. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5503\u20135512 (2017)","DOI":"10.1109\/CVPR.2017.475"},{"key":"14_CR10","doi-asserted-by":"crossref","unstructured":"Deng, C., Wu, Q., Wu, Q., Hu, F., Lyu, F., Tan, M.: Visual grounding via accumulated attention. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7746\u20137755 (2018)","DOI":"10.1109\/CVPR.2018.00808"},{"key":"14_CR11","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"14_CR12","doi-asserted-by":"crossref","unstructured":"Gan, Z., Cheng, Y., Kholy, A.E., Li, L., Liu, J., Gao, J.: Multi-step reasoning via recurrent dual attention for visual dialog. In: Proceedings of the Conference of the Association for Computational Linguistics, pp. 6463\u20136474 (2019)","DOI":"10.18653\/v1\/P19-1648"},{"key":"14_CR13","doi-asserted-by":"crossref","unstructured":"Gao, P., et al.: Dynamic fusion with intra-and inter-modality attention flow for visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6639\u20136648 (2019)","DOI":"10.1109\/CVPR.2019.00680"},{"key":"14_CR14","doi-asserted-by":"crossref","unstructured":"Guo, D., Xu, C., Tao, D.: Image-question-answer synergistic network for visual dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 10434\u201310443 (2019)","DOI":"10.1109\/CVPR.2019.01068"},{"key":"14_CR15","doi-asserted-by":"crossref","unstructured":"Hori, C., et al.: End-to-end audio visual scene-aware dialog using multimodal attention-based video features. In: ICASSP 2019\u20132019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2352\u20132356 (2019)","DOI":"10.1109\/ICASSP.2019.8682583"},{"key":"14_CR16","unstructured":"Ilievski, I., Yan, S., Feng, J.: A focused dynamic attention model for visual question answering. arXiv preprint arXiv:1604.01485 (2016)"},{"key":"14_CR17","doi-asserted-by":"crossref","unstructured":"Jain, U., Lazebnik, S., Schwing, A.G.: Two can play this game: visual dialog with discriminative question generation and answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5754\u20135763 (2018)","DOI":"10.1109\/CVPR.2018.00603"},{"key":"14_CR18","doi-asserted-by":"crossref","unstructured":"Kang, G.C., Lim, J., Zhang, B.T.: Dual attention networks for visual reference resolution in visual dialog. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing, pp. 2024\u20132033 (2019)","DOI":"10.18653\/v1\/D19-1209"},{"key":"14_CR19","doi-asserted-by":"crossref","unstructured":"Kim, H., Tan, H., Bansal, M.: Modality-balanced models for visual dialogue. arXiv preprint arXiv:2001.06354 (2020)","DOI":"10.1609\/aaai.v34i05.6320"},{"key":"14_CR20","unstructured":"Kim, J.H., Jun, J., Zhang, B.T.: Bilinear attention networks. In: Advances in Neural Information Processing Systems, pp. 
1564\u20131574 (2018)"},{"key":"14_CR21","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"14_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"160","DOI":"10.1007\/978-3-030-01267-0_10","volume-title":"Computer Vision \u2013 ECCV 2018","author":"S Kottur","year":"2018","unstructured":"Kottur, S., Moura, J.M.F., Parikh, D., Batra, D., Rohrbach, M.: Visual coreference resolution in visual dialog using neural module networks. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11219, pp. 160\u2013178. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01267-0_10"},{"key":"14_CR23","unstructured":"Kottur, S., Moura, J.M., Parikh, D., Batra, D., Rohrbach, M.: CLEVR-dialog: a diagnostic dataset for multi-round reasoning in visual dialog. arXiv preprint arXiv:1903.03166 (2019)"},{"issue":"1","key":"14_CR24","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"14_CR25","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., Soricut, R.: ALBERT: a lite BERT for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942 (2019)"},{"key":"14_CR26","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"14_CR27","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. arXiv preprint arXiv:1908.02265 (2019)"},{"key":"14_CR28","unstructured":"Lu, J., Kannan, A., Yang, J., Parikh, D., Batra, D.: Best of both worlds: transferring knowledge from discriminative learning to a generative visual dialog model. In: Advances in Neural Information Processing Systems, pp. 314\u2013324 (2017)"},{"key":"14_CR29","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Hierarchical question-image co-attention for visual question answering. In: Advances in Neural Information Processing Systems, pp. 289\u2013297 (2016)"},{"key":"14_CR30","doi-asserted-by":"crossref","unstructured":"Murahari, V., Batra, D., Parikh, D., Das, A.: Large-scale pretraining for visual dialog: a simple state-of-the-art baseline. arXiv preprint arXiv:1912.02379 (2019)","DOI":"10.1007\/978-3-030-58523-5_20"},{"key":"14_CR31","doi-asserted-by":"crossref","unstructured":"Nguyen, D.K., Okatani, T.: Improved fusion of visual and language representations by dense symmetric co-attention for visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6087\u20136096 (2018)","DOI":"10.1109\/CVPR.2018.00637"},{"key":"14_CR32","doi-asserted-by":"crossref","unstructured":"Niu, Y., Zhang, H., Zhang, M., Zhang, J., Lu, Z., Wen, J.R.: Recursive visual attention in visual dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6679\u20136688 (2019)","DOI":"10.1109\/CVPR.2019.00684"},{"key":"14_CR33","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.: Glove: global vectors for word representation. 
In: Proceedings of the Conference on Empirical Methods in Natural Language Processing, pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"14_CR34","doi-asserted-by":"crossref","unstructured":"Qi, J., Niu, Y., Huang, J., Zhang, H.: Two causal principles for improving visual dialog. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10860\u201310869 (2020)","DOI":"10.1109\/CVPR42600.2020.01087"},{"key":"14_CR35","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems, pp. 91\u201399 (2015)"},{"key":"14_CR36","doi-asserted-by":"crossref","unstructured":"Schwartz, I., Yu, S., Hazan, T., Schwing, A.G.: Factor graph attention. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2039\u20132048 (2019)","DOI":"10.1109\/CVPR.2019.00214"},{"key":"14_CR37","unstructured":"Seo, P.H., Lehrmann, A., Han, B., Sigal, L.: Visual reference resolution using attention memory for visual dialog. In: Advances in Neural Information Processing Systems, pp. 3719\u20133729 (2017)"},{"key":"14_CR38","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics, pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"14_CR39","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: Advances in Neural Information Processing Systems, pp. 3104\u20133112 (2014)"},{"key":"14_CR40","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"14_CR41","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"14_CR42","doi-asserted-by":"crossref","unstructured":"Wang, Y., Joty, S., Lyu, M.R., King, I., Xiong, C., Hoi, S.C.: VD-BERT: a unified vision and dialog transformer with BERT. arXiv preprint arXiv:2004.13278 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.269"},{"key":"14_CR43","doi-asserted-by":"crossref","unstructured":"Wu, Q., Wang, P., Shen, C., Reid, I., van den Hengel, A.: Are you talking to me? Reasoned visual dialog generation through adversarial learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6106\u20136115 (2018)","DOI":"10.1109\/CVPR.2018.00639"},{"key":"14_CR44","doi-asserted-by":"crossref","unstructured":"Yang, T., Zha, Z.J., Zhang, H.: Making history matter: history-advantage sequence training for visual dialog. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2561\u20132569 (2019)","DOI":"10.1109\/ICCV.2019.00265"},{"key":"14_CR45","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., Smola, A.: Stacked attention networks for image question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
21\u201329 (2016)","DOI":"10.1109\/CVPR.2016.10"},{"key":"14_CR46","doi-asserted-by":"crossref","unstructured":"Yu, L., et al.: MAttNet: modular attention network for referring expression comprehension. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1307\u20131315 (2018)","DOI":"10.1109\/CVPR.2018.00142"},{"key":"14_CR47","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Cui, Y., Tao, D., Tian, Q.: Deep modular co-attention networks for visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6281\u20136290 (2019)","DOI":"10.1109\/CVPR.2019.00644"},{"key":"14_CR48","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Fan, J., Tao, D.: Multi-modal factorized bilinear pooling with co-attention learning for visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1821\u20131830 (2017)","DOI":"10.1109\/ICCV.2017.202"},{"issue":"12","key":"14_CR49","doi-asserted-by":"publisher","first-page":"5947","DOI":"10.1109\/TNNLS.2018.2817340","volume":"29","author":"Z Yu","year":"2018","unstructured":"Yu, Z., Yu, J., Xiang, C., Fan, J., Tao, D.: Beyond bilinear: generalized multimodal factorized high-order pooling for visual question answering. IEEE Trans. Neural Netw. Learn. Syst. 29(12), 5947\u20135959 (2018)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"14_CR50","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: Generative visual dialogue system via weighted likelihood estimation. In: Proceedings of the International Joint Conference on Artificial Intelligence, pp. 1025\u20131031 (2019)","DOI":"10.24963\/ijcai.2019\/144"},{"key":"14_CR51","doi-asserted-by":"crossref","unstructured":"Zheng, Z., Wang, W., Qi, S., Zhu, S.C.: Reasoning visual dialogs with structural and partial observations. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6669\u20136678 (2019)","DOI":"10.1109\/CVPR.2019.00683"},{"key":"14_CR52","doi-asserted-by":"crossref","unstructured":"Zhuang, B., Wu, Q., Shen, C., Reid, I., van den Hengel, A.: Parallel attention: a unified framework for visual object discovery through dialogs and queries. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
4252\u20134261 (2018)","DOI":"10.1109\/CVPR.2018.00447"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58586-0_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:06:17Z","timestamp":1732838777000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58586-0_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585853","9783030585860"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58586-0_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"30 November 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the 
conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic. From the ECCV Workshops 249 full papers, 18 short papers, and 21 further contributions were published out of a total of 467 submissions.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}