{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T20:16:59Z","timestamp":1774124219881,"version":"3.50.1"},"reference-count":66,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,6]]},"DOI":"10.1109\/cvpr52688.2022.01499","type":"proceedings-article","created":{"date-parts":[[2022,9,27]],"date-time":"2022-09-27T19:56:41Z","timestamp":1664308601000},"page":"15407-15417","source":"Crossref","is-referenced-by-count":35,"title":["Less is More: Generating Grounded Navigation Instructions from Landmarks"],"prefix":"10.1109","author":[{"given":"Su","family":"Wang","sequence":"first","affiliation":[{"name":"Google Research"}]},{"given":"Ceslee","family":"Montgomery","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"Jordi","family":"Orbay","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"Vighnesh","family":"Birodkar","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"Aleksandra","family":"Faust","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"Izzeddin","family":"Gur","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"Natasha","family":"Jaques","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"Austin","family":"Waters","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"Jason","family":"Baldridge","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"Peter","family":"Anderson","sequence":"additional","affiliation":[{"name":"Google Research"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Visual landmark selection for gener-ating grounded and interpretable navigation instructions","volume-title":"CVPR workshop on Deep Learning for Semantic Visual Nav-igation","author":"Agarwal","year":"2019"},{"key":"ref2","article-title":"On evaluation of embodied navi-gation agents","volume":"abs\/1807.06757","author":"Peter","year":"2018","journal-title":"CoRR"},{"key":"ref3","article-title":"SPICE: Semantic Propositional Image Caption Evaluation","author":"Anderson","year":"2016","journal-title":"ECCV"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/s10339-007-0199-2"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.3389\/fpsyg.2012.00304"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2017.00081"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"ref11","article-title":"History aware multimodal transformer for vision-and-language navigation","author":"Chen","year":"2021","journal-title":"NeurIPS"},{"key":"ref12","article-title":"Microsoft COCO Captions: Data Collection and Evaluation Server","author":"Chen","year":"2015","journal-title":"arXiv preprint"},{"key":"ref13","article-title":"Unifying vision-and-language tasks via text generation","author":"Cho","year":"2021","journal-title":"Proceedings of ICML 2021"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/2909824.3020241"},{"key":"ref15","article-title":"Causal confusion in imitation learning","author":"Haan","year":"2019","journal-title":"NeurIPS"},{"key":"ref16","article-title":"BERT: Pre-training of deep bidirectional trans-formers for language understanding","author":"Devlin","year":"2019","journal-title":"NAACL"},{"key":"ref17","first-page":"757","article-title":"Generation of landmark-based navigation instructions from open-source data","volume-title":"Proceedings of the 13th Conference of the European Chapter of the Association for Computational Linguistics","author":"Drager"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1098\/rstb.2012.0533"},{"key":"ref19","author":"Fellner","year":"2017","journal-title":"Automatic generation of landmark-based in-door routing instructions"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1155\/2019\/9345861"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1037\/0278-7393.31.2.195"},{"key":"ref22","article-title":"Speaker-follower models for vision-and-language navigation","author":"Fried","year":"2018","journal-title":"NeurIPS"},{"key":"ref23","article-title":"Speaker-Follower Models for Vision-and-Language Navigation","volume-title":"Proceedings of NeurIPS","author":"Fried","year":"2018"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref25","article-title":"Wiki-40b: Multilingual language model dataset","author":"Guo","year":"2020","journal-title":"LREC 2020"},{"key":"ref26","first-page":"235","article-title":"A proposal for a configurable silver standard","volume-title":"Proceedings of the Fourth Linguistic Annotation Workshop","author":"Hahn"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-01516-8_6"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref29","first-page":"7685","article-title":"editors","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Hong","year":"2020"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173851"},{"key":"ref31","article-title":"Effective and general evaluation for in-struction conditioned navigation using dynamic time warping","volume-title":"NeurIPS Visually Grounded Interaction and Language Workshop","author":"Ilharco","year":"2019"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.293"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.3115\/1075096.1075150"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/11556114_22"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00428"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.356"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.alvr-1.5"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d18-2012"},{"key":"ref39","article-title":"Generative language-grounded policy in vision-and-language navigation with bayes\u2019 rule","author":"Kurita","year":"2021","journal-title":"ICLR"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_45"},{"key":"ref41","article-title":"Microsoft COCO: Com-mon Objects in Context","author":"Lin","year":"2014","journal-title":"ECCV"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-01790-7_11"},{"key":"ref43","article-title":"Stacked hour-glass networks for human pose estimation","author":"Newell","year":"2016","journal-title":"ECCV"},{"key":"ref44","article-title":"Bleu: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002","journal-title":"ACL"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01564"},{"key":"ref46","article-title":"Connecting vision and language with localized narratives","author":"Pont-Tuset","year":"2020","journal-title":"ECCV"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-6319"},{"issue":"140","key":"ref48","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"Journal of Machine Learning Research"},{"key":"ref49","article-title":"The CALBC silver standard corpus for biomedical named entities - a study in harmo-nizing the contributions from four independent named entity taggers","volume-title":"Proceedings of the Seventh International Con-ference on Language Resources and Evaluation (LREC\u201910)","author":"Rebholz-Schuhmann"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-32255-9_4"},{"key":"ref51","first-page":"958","article-title":"Computing EM-based alignments of routes and route directions as a basis for nat-ural language generation","volume-title":"Proceedings of the 23rd Inter-national Conference on Computational Linguistics (Coling 2010)","author":"Roth"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p16-1162"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01075"},{"key":"ref55","author":"Sollami","year":"2021","journal-title":"Multimodal conditional-ity for natural language generation"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-2346"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1218"},{"key":"ref58","article-title":"Learning to nav-igate unseen environments: Back translation with environ-mental dropout","author":"Tan","year":"2019","journal-title":"NAACL"},{"key":"ref59","article-title":"Cider: Consensus-based image description evaluation","author":"Ramakrishna","year":"2015","journal-title":"CVPR"},{"key":"ref60","author":"Wang","year":"2021","journal-title":"Simvlm: Simple visual language model pretraining with weak supervision"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.406"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.41"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/s10339-021-01012-x"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.111"},{"key":"ref65","article-title":"Ob-jects as Points","volume-title":"Proceedings of CVPR","author":"Zhou","year":"2019"},{"key":"ref66","article-title":"More grounded image captioning by dis-tilling image-text matching model","author":"Zhou","year":"2021","journal-title":"CVPR"}],"event":{"name":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"New Orleans, LA, USA","start":{"date-parts":[[2022,6,18]]},"end":{"date-parts":[[2022,6,24]]}},"container-title":["2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9878378\/9878366\/09878905.pdf?arnumber=9878905","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,24]],"date-time":"2024-01-24T02:22:05Z","timestamp":1706062925000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9878905\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6]]},"references-count":66,"URL":"https:\/\/doi.org\/10.1109\/cvpr52688.2022.01499","relation":{},"subject":[],"published":{"date-parts":[[2022,6]]}}}