{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,19]],"date-time":"2026-06-19T17:33:17Z","timestamp":1781890397065,"version":"3.54.5"},"reference-count":37,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61773333"],"award-info":[{"award-number":["61773333"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China and the Royal Society of Britain","doi-asserted-by":"publisher","award":["62111530148"],"award-info":[{"award-number":["62111530148"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004543","name":"China Scholarship Council","doi-asserted-by":"publisher","award":["201908130016"],"award-info":[{"award-number":["201908130016"]}],"id":[{"id":"10.13039\/501100004543","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Cogn. Dev. Syst."],"published-print":{"date-parts":[[2023,3]]},"DOI":"10.1109\/tcds.2021.3139543","type":"journal-article","created":{"date-parts":[[2021,12,31]],"date-time":"2021-12-31T20:46:48Z","timestamp":1640983608000},"page":"3-15","source":"Crossref","is-referenced-by-count":12,"title":["Vision-and-Language Navigation Based on Cross-Modal Feature Fusion in Indoor Environment"],"prefix":"10.1109","volume":"15","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7646-4958","authenticated-orcid":false,"given":"Shuhuan","family":"Wen","sequence":"first","affiliation":[{"name":"Engineering Research Center of the Ministry of Education for Intelligent Control System and Intelligent Equipment and the Key Laboratory of Industrial Computer Control Engineering of Hebei Province, Yanshan University, Qinhuangdao, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaohan","family":"Lv","sequence":"additional","affiliation":[{"name":"Engineering Research Center of the Ministry of Education for Intelligent Control System and Intelligent Equipment and the Key Laboratory of Industrial Computer Control Engineering of Hebei Province, Yanshan University, Qinhuangdao, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1006-7594","authenticated-orcid":false,"given":"F. Richard","family":"Yu","sequence":"additional","affiliation":[{"name":"Department of Systems and Computer Engineering, Carleton University, Ottawa, ON, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1449-1077","authenticated-orcid":false,"given":"Simeng","family":"Gong","sequence":"additional","affiliation":[{"name":"Engineering Research Center of the Ministry of Education for Intelligent Control System and Intelligent Equipment and the Key Laboratory of Industrial Computer Control Engineering of Hebei Province, Yanshan University, Qinhuangdao, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"ref2","first-page":"3314","article-title":"Speaker-follower models for vision-and-language navigation","volume-title":"Proc. Neural Inf. Process. Syst. (NeurIPS)","author":"Fried"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1268"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1181"},{"key":"ref5","first-page":"1","article-title":"Self-monitoring navigation agent via auxiliary progress estimation","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Ma"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00689"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01281"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11832"},{"key":"ref9","first-page":"6000","article-title":"Attention is all you need","volume-title":"Proc. Conf. Neural Inf. Process. Syst. (NIPS)","author":"Vaswani"},{"key":"ref10","article-title":"VL-BERT: Pre-training of generic visual-linguistic representations","author":"Su","year":"2019","journal-title":"arXiv:1908.08530"},{"key":"ref11","first-page":"1928","article-title":"Asynchronous methods for deep reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Mnih"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1219"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6248"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.143"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045336"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2019.08.009"},{"key":"ref17","first-page":"289","article-title":"Hierarchical question-image co-attention for visual question answering","volume-title":"Proc. Conf. Nat. Inf. Process. Syst. (NIPS)","author":"Lu"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref19","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv:1810.04805"},{"key":"ref20","article-title":"VisualBERT: A simple and performant baseline for vision and language","author":"Li","year":"2019","journal-title":"arXiv:1908.03557"},{"key":"ref21","first-page":"13","article-title":"VilBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Lu"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2018.XIV.067"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1106"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01003"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00690"},{"key":"ref26","first-page":"1","article-title":"Learning to follow language instructions with adversarial reward induction","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Bahdanau"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01000"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2965857"},{"key":"ref29","article-title":"IMPALA: Scalale distributed deep-RL with importance weighted actor-learner architectures","author":"Espeholt","year":"2018","journal-title":"arXiv:1802.01561"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00679"},{"key":"ref31","first-page":"1","article-title":"From language to goals: Inverse reinforcement learning for vision-based instruction following","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Fu"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58539-6_5"},{"key":"ref33","article-title":"A recurrent vision-and-language bert for navigation","author":"Hong","year":"2020","journal-title":"arXiv:2011.13922"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1287"},{"key":"ref35","article-title":"Grounded language learning in a simulated 3D world","author":"Hermann","year":"2017","journal-title":"arXiv:1706.06551"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2018.xiv.066"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2017.XIII.056"}],"container-title":["IEEE Transactions on Cognitive and Developmental Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7274989\/10061504\/09667107.pdf?arnumber=9667107","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,13]],"date-time":"2024-01-13T22:08:07Z","timestamp":1705183687000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9667107\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,3]]},"references-count":37,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tcds.2021.3139543","relation":{},"ISSN":["2379-8920","2379-8939"],"issn-type":[{"value":"2379-8920","type":"print"},{"value":"2379-8939","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,3]]}}}