{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,25]],"date-time":"2026-06-25T23:50:17Z","timestamp":1782431417135,"version":"3.54.5"},"publisher-location":"Cham","reference-count":64,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030585228","type":"print"},{"value":"9783030585235","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58523-5_20","type":"book-chapter","created":{"date-parts":[[2020,12,3]],"date-time":"2020-12-03T20:13:16Z","timestamp":1607026396000},"page":"336-352","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":59,"title":["Large-Scale Pretraining for Visual Dialog: A Simple State-of-the-Art Baseline"],"prefix":"10.1007","author":[{"given":"Vishvak","family":"Murahari","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dhruv","family":"Batra","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Devi","family":"Parikh","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Abhishek","family":"Das","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2020,12,4]]},"reference":[{"key":"20_CR1","doi-asserted-by":"crossref","unstructured":"de Vries, H., Strub, F., Chandar, S., Pietquin, O., Larochelle, H., Courville, A.: GuessWhat?! visual object discovery through multi-modal dialogue. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.475"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Das, A., et al.: Visual dialog. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.121"},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Strub, F., De Vries, H., Mary, J., Piot, B., Courville, A., Pietquin, O.: End-to-end optimization of goal-driven and visually grounded dialogue systems, arXiv preprint arXiv:1703.05423 (2017)","DOI":"10.24963\/ijcai.2017\/385"},{"key":"20_CR4","doi-asserted-by":"crossref","unstructured":"Das, A., Kottur, S., Moura, J.M., Lee, S., Batra, D.: Learning cooperative visual dialog agents with deep reinforcement learning. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.321"},{"key":"20_CR5","unstructured":"Lu, J., Kannan, A., Yang, J., Parikh, D., Batra, D.: Best of both worlds: transferring knowledge from discriminative learning to a generative visual dialog model. In: NIPS (2017)"},{"key":"20_CR6","unstructured":"Massiceti, D., Siddharth, N., Dokania, P.K., Torr, P.H.: FLIPDIAL: a generative model for two-way visual dialogue. In: CVPR (2018)"},{"key":"20_CR7","doi-asserted-by":"crossref","unstructured":"Wu, Q., Wang, P., Shen, C., Reid, I., van den Hengel, A.: Are you talking to me? reasoned visual dialog generation through adversarial learning. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00639"},{"key":"20_CR8","doi-asserted-by":"crossref","unstructured":"Jain, U., Lazebnik, S., Schwing, A.G.: Two can play this game: visual dialog with discriminative question generation and answering. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00603"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Kottur, S., Moura, J.M., Parikh, D., Batra, D., Rohrbach, M.: Visual coreference resolution in visual dialog using neural module networks. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01267-0_10"},{"key":"20_CR10","unstructured":"Lee, S.-W., Gao, T., Yang, S., Yoo, J., Ha, J.-W.: Large-scale answerer in questioner\u2019s mind for visual dialog question generation. In: ICLR (2019)"},{"key":"20_CR11","doi-asserted-by":"crossref","unstructured":"Niu, Y., Zhang, H., Zhang, M., Zhang, J., Lu, Z., Wen, J.-R.: Recursive visual attention in visual dialog. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00684"},{"key":"20_CR12","doi-asserted-by":"crossref","unstructured":"Zheng, Z., Wang, W., Qi, S., Zhu, S.-C.: Reasoning visual dialogs with structural and partial observations. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00683"},{"key":"20_CR13","doi-asserted-by":"crossref","unstructured":"Schwartz, I., Yu, S., Hazan, T., Schwing, A.G.: Factor graph attention. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00214"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Kang, G.-C., Lim, J., Zhang, B.-T.: Dual attention networks for visual reference resolution in visual dialog. In: EMNLP (2019)","DOI":"10.18653\/v1\/D19-1209"},{"key":"20_CR15","doi-asserted-by":"crossref","unstructured":"Gan, Z., Cheng, Y., Kholy, A.E., Li, L., Liu, J., Gao, J.: Multi-step reasoning via recurrent dual attention for visual dialog. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1648"},{"key":"20_CR16","unstructured":"Kottur, S., Moura, J.M., Parikh, D., Batra, D., Rohrbach, M.: CLEVR-dialog: a diagnostic dataset for multi-round reasoning in visual dialog. In: NAACL (2019)"},{"key":"20_CR17","doi-asserted-by":"crossref","unstructured":"Murahari, V., Chattopadhyay, P., Batra, D., Parikh, D., Das, A.: Improving generative visual dialog by answering diverse questions. In: EMNLP (2019)","DOI":"10.18653\/v1\/D19-1152"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Shekhar, R., et al.: Beyond task success: a closer look at jointly learning to see, ask, and guesswhat. In: NAACL (2019)","DOI":"10.18653\/v1\/N19-1265"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Yang, T., Zha, Z.-J., Zhang, H.: Making history matter: gold-critic sequence training for visual dialog. arXiv preprint arXiv:1902.09326 (2019)","DOI":"10.1109\/ICCV.2019.00265"},{"key":"20_CR20","doi-asserted-by":"crossref","unstructured":"Guo, D., Xu, C., Tao, D.: Image-question-answer synergistic network for visual dialog. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01068"},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"Qi, J., Niu, Y., Huang, J., Zhang, H.: Two causal principles for improving visual dialog, arXiv preprint arXiv:1911.10496 (2019)","DOI":"10.1109\/CVPR42600.2020.01087"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"Jiang, X., Yu, J., et al.: DualVD: an adaptive dual encoding model for deep visual understanding in visual dialogue. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6769"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Alamri, H., et al.: Audio visual scene-aware dialog. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00774"},{"key":"20_CR24","unstructured":"de Vries, H., Shuster, K., Batra, D., Parikh, D., Weston, J., Kiela, D.: Talk the walk: navigating new york city through grounded dialogue, arXiv preprint arXiv:1807.03367 (2018)"},{"key":"20_CR25","doi-asserted-by":"crossref","unstructured":"Nguyen, K., Daum\u00e9 III, H.: Help, anna! visual navigation with natural multimodal assistance via retrospective curiosity-encouraging imitation learning. In: EMNLP (2019)","DOI":"10.18653\/v1\/D19-1063"},{"key":"20_CR26","unstructured":"Thomason, J., Murray, M., Cakmak, M., Zettlemoyer, L.: Vision-and-dialog navigation (2019)"},{"key":"20_CR27","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Kiros, R., et al.: Aligning books and movies: towards story-like visual explanations by watching movies and reading books. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.11"},{"key":"20_CR28","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: ACL (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"20_CR29","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"20_CR30","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"20_CR31","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: ICLR (2015)"},{"key":"20_CR32","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.: ImageNet classification with deep convolutional neural networks. In: NIPS (2012)"},{"key":"20_CR33","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I.: Improving language understanding with unsupervised learning (2018)"},{"key":"20_CR34","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NIPS (2017)"},{"key":"20_CR35","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL (2019)"},{"key":"20_CR36","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized BERT pretraining approach, arXiv preprint arXiv:1907.11692 (2019)"},{"key":"20_CR37","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., Soricut, R.: ALBERT: a lite BERT for self-supervised learning of language representations, arXiv preprint arXiv:1909.11942 (2019)"},{"key":"20_CR38","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., Le, Q.L.: XLNET: generalized autoregressive pretraining for language understanding, arXiv preprint arXiv:1906.08237 (2019)"},{"key":"20_CR39","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer, arXiv preprint arXiv:1910.10683 (2019)"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: DialoGPT: large-scale generative pre-training for conversational response generation, arXiv preprint arXiv:1911.00536 (2019)","DOI":"10.18653\/v1\/2020.acl-demos.30"},{"key":"20_CR41","doi-asserted-by":"crossref","unstructured":"Russakovsky, O., et al.: ImageNet large scale visual recognition challenge. IJCV (2015)","DOI":"10.1007\/s11263-015-0816-y"},{"key":"20_CR42","doi-asserted-by":"crossref","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. IJCV (2017)","DOI":"10.1007\/s11263-016-0981-7"},{"key":"20_CR43","doi-asserted-by":"crossref","unstructured":"Wang, A., Singh, A., Michael, J., Hill, F., Levy, O., Bowman, S.R.: GLUE: a multi-task benchmark and analysis platform for natural language understanding, arXiv preprint arXiv:1804.07461 (2018)","DOI":"10.18653\/v1\/W18-5446"},{"key":"20_CR44","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: NeurIPS (2019)"},{"key":"20_CR45","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.-J., Chang, K.-W.: VisualBERT: a simple and performant baseline for vision and language, arXiv preprint arXiv:1908.03557 (2019)"},{"key":"20_CR46","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers, arXiv preprint arXiv:1908.07490 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"20_CR47","unstructured":"Chen, Y.-C., et al.: UNITER: Learning UNiversal Image-TExt Representations, arXiv preprint arXiv:1909.11740 (2019)"},{"key":"20_CR48","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Jiang, D., Zhou, M.: Unicoder-VL: a universal encoder for vision and language by cross-modal pre-training, arXiv preprint arXiv:1908.06066 (2019)","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"20_CR49","unstructured":"Su, W., et al.: VL-BERT: pre-training of generic visual-linguistic representations, arXiv preprint arXiv:1908.08530 (2019)"},{"key":"20_CR50","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: VideoBERT: a joint model for video and language representation learning, arXiv preprint arXiv:1904.01766 (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"20_CR51","doi-asserted-by":"crossref","unstructured":"Zellers, R., Bisk, Y., Farhadi, A., Choi, Y.: From recognition to cognition: visual commonsense reasoning. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00688"},{"key":"20_CR52","doi-asserted-by":"crossref","unstructured":"Suhr, A., Zhou, S., Zhang, A., Zhang, I., Bai, H., Artzi, Y.: A corpus for reasoning about natural language grounded in photographs. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1644"},{"key":"20_CR53","unstructured":"Xie, N., Lai, F., Doran, D., Kadav, A.: Visual entailment: a novel task for fine-grained image understanding, arXiv preprint arXiv:1901.06706 (2019)"},{"key":"20_CR54","doi-asserted-by":"crossref","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. In: TACL (2014)","DOI":"10.1162\/tacl_a_00166"},{"key":"20_CR55","doi-asserted-by":"crossref","unstructured":"Lee, K.-H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"20_CR56","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.L.: ReferItGame: referring to objects in photographs of natural scenes. In: EMNLP (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"20_CR57","doi-asserted-by":"crossref","unstructured":"Hao, W., Li, C., Li, X., Carin, L., Gao, J.: Towards learning a generic agent for vision-and-language navigation via pre-training. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01315"},{"key":"20_CR58","unstructured":"Paszke, A., et al.: Pytorch: an imperative style, high-performance deep learning library. In: Wallach, H., Larochelle, H., Beygelzimer, A., d\u2019Alch\u00e9-Buc, F., Fox, E., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 32, pp. 8024\u20138035, Curran Associates Inc. (2019)"},{"key":"20_CR59","unstructured":"Wolf, T., Sanh, V., Chaumond, J., Delangue, C.: TransferTransfo: a transfer learning approach for neural network based conversational agents, arXiv preprint arXiv:1901.08149 (2019)"},{"key":"20_CR60","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering, arXiv preprint arXiv:1707.07998 (2017)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"20_CR61","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: NIPS (2015)"},{"key":"20_CR62","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: ECCV (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"20_CR63","unstructured":"Kingma, D., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (2015)"},{"key":"20_CR64","doi-asserted-by":"crossref","unstructured":"Lewis, M., et al.: Bart: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension, arXiv preprint arXiv:1910.13461 (2019)","DOI":"10.18653\/v1\/2020.acl-main.703"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58523-5_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:06:41Z","timestamp":1733184401000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58523-5_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585228","9783030585235"],"references-count":64,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58523-5_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"4 December 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic. From the ECCV Workshops 249 full papers, 18 short papers, and 21 further contributions were published out of a total of 467 submissions.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}