{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:14:18Z","timestamp":1775067258215,"version":"3.50.1"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030585389","type":"print"},{"value":"9783030585396","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58539-6_16","type":"book-chapter","created":{"date-parts":[[2020,11,6]],"date-time":"2020-11-06T19:02:46Z","timestamp":1604689366000},"page":"259-274","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":112,"title":["Improving Vision-and-Language Navigation with Image-Text Pairs from the Web"],"prefix":"10.1007","author":[{"given":"Arjun","family":"Majumdar","sequence":"first","affiliation":[]},{"given":"Ayush","family":"Shrivastava","sequence":"additional","affiliation":[]},{"given":"Stefan","family":"Lee","sequence":"additional","affiliation":[]},{"given":"Peter","family":"Anderson","sequence":"additional","affiliation":[]},{"given":"Devi","family":"Parikh","sequence":"additional","affiliation":[]},{"given":"Dhruv","family":"Batra","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,11,7]]},"reference":[{"key":"16_CR1","unstructured":"Anderson, P., et al.: On evaluation of embodied navigation agents. arXiv preprint arXiv:1807.06757 (2018)"},{"key":"16_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"16_CR3","unstructured":"Anderson, P., Shrivastava, A., Parikh, D., Batra, D., Lee, S.: Chasing ghosts: instruction following as Bayesian state tracking. In: Advances in Neural Information Processing Systems, pp. 369\u2013379 (2019)"},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00387"},{"key":"16_CR5","doi-asserted-by":"crossref","unstructured":"Chang, A., et al.: Matterport3D: Learning from RGB-D data in indoor environments. In: International Conference on 3D Vision (3DV) (2017)","DOI":"10.1109\/3DV.2017.00081"},{"key":"16_CR6","unstructured":"Chen, X., et al.: Microsoft coco captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"16_CR7","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"16_CR8","unstructured":"Fried, D., et al.: Speaker-follower models for vision-and-language navigation. In: Advances in Neural Information Processing Systems, pp. 3314\u20133325 (2018)"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in visual question answering. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"16_CR10","doi-asserted-by":"crossref","unstructured":"Hao, W., Li, C., Li, X., Carin, L., Gao, J.: Towards learning a generic agent for vision-and-language navigation via pre-training. arXiv preprint arXiv:2002.10638 (2020)","DOI":"10.1109\/CVPR42600.2020.01315"},{"key":"16_CR11","doi-asserted-by":"crossref","unstructured":"Huang, H., Jain, V., Mehta, H., Baldridge, J., Ie, E.: Multi-modal discriminative model for vision-and-language navigation. arXiv preprint arXiv:1905.13358 (2019)","DOI":"10.18653\/v1\/W19-1605"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.L.: Referit game: referring to objects in photographs of natural scenes. In: EMNLP (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Ke, L., et al.: Tactical rewind: self-correction via backtracking in vision-and-language navigation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6741\u20136749 (2019)","DOI":"10.1109\/CVPR.2019.00690"},{"key":"16_CR14","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. arXiv preprint arXiv:1602.07332 (2016)"},{"key":"16_CR15","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Jiang, D., Zhou, M.: Unicoder-VL: a universal encoder for vision and language by cross-modal pre-training. arXiv preprint arXiv:1908.06066 (2019)","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"16_CR16","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"Li, X., et al.: Robust navigation with language pretraining and stochastic sampling. arXiv preprint arXiv:1909.02244 (2019)","DOI":"10.18653\/v1\/D19-1159"},{"key":"16_CR18","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems, pp. 13\u201323 (2019)"},{"key":"16_CR19","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Hierarchical question-image co-attention for visual question answering. In: Advances in Neural Information Processing Systems, pp. 289\u2013297 (2016)"},{"key":"16_CR20","unstructured":"Ma, C.Y., et al.: Self-monitoring navigation agent via auxiliary progress estimation. arXiv preprint arXiv:1901.03035 (2019)"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"Ma, C.Y., Wu, Z., AlRegib, G., Xiong, C., Kira, Z.: The regretful agent: heuristic-aided navigation through progress estimation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6732\u20136740 (2019)","DOI":"10.1109\/CVPR.2019.00689"},{"key":"16_CR22","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems (NIPS) (2015)"},{"issue":"3","key":"16_CR23","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: ImageNet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"16_CR25","unstructured":"Simonyan, K., Vedaldi, A., Zisserman, A.: Deep inside convolutional networks: visualising image classification models and saliency maps. arXiv preprint arXiv:1312.6034 (2013)"},{"key":"16_CR26","unstructured":"Su, W., et al.: VL-BERT: pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530 (2019)"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. In: EMNLP (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"16_CR28","doi-asserted-by":"crossref","unstructured":"Tan, H., Yu, L., Bansal, M.: Learning to navigate unseen environments: back translation with environmental dropout. arXiv preprint arXiv:1904.04195 (2019)","DOI":"10.18653\/v1\/N19-1268"},{"key":"16_CR29","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Reinforced cross-modal matching and self-supervised imitation learning for vision-language navigation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6629\u20136638 (2019)","DOI":"10.1109\/CVPR.2019.00679"},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"Wang, X., Xiong, W., Wang, H., Yang Wang, W.: Look before you leap: bridging model-free and model-based reinforcement learning for planned-ahead vision-and-language navigation. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 37\u201353 (2018)","DOI":"10.1007\/978-3-030-01270-0_3"},{"key":"16_CR32","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J.J., Gao, J.: Unified vision-language pre-training for image captioning and VQA. arXiv preprint arXiv:1909.11059 (2019)","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"16_CR33","doi-asserted-by":"crossref","unstructured":"Zhu, F., Zhu, Y., Chang, X., Liang, X.: Vision-language navigation with self-supervised auxiliary reasoning tasks. arXiv preprint arXiv:1911.07883 (2019)","DOI":"10.1109\/CVPR42600.2020.01003"},{"key":"16_CR34","doi-asserted-by":"crossref","unstructured":"Zhu, Y., et al.: Aligning books and movies: towards story-like visual explanations by watching movies and reading books. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 19\u201327 (2015)","DOI":"10.1109\/ICCV.2015.11"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58539-6_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,6]],"date-time":"2024-11-06T00:12:08Z","timestamp":1730851928000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58539-6_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585389","9783030585396"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58539-6_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"7 November 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}