{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T19:10:12Z","timestamp":1780600212665,"version":"3.54.1"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2022,3,23]],"date-time":"2022-03-23T00:00:00Z","timestamp":1647993600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,3,23]],"date-time":"2022-03-23T00:00:00Z","timestamp":1647993600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/100014717","name":"National Outstanding Youth Science Fund Project of National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100014717","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Process Lett"],"published-print":{"date-parts":[[2022,10]]},"DOI":"10.1007\/s11063-022-10796-8","type":"journal-article","created":{"date-parts":[[2022,3,23]],"date-time":"2022-03-23T20:03:31Z","timestamp":1648065811000},"page":"3979-3998","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":32,"title":["Improving Target-driven Visual Navigation with Attention on 3D Spatial Relationships"],"prefix":"10.1007","volume":"54","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8873-8840","authenticated-orcid":false,"given":"Yunlian","family":"Lyu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yimin","family":"Shi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xianggang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2022,3,23]]},"reference":[{"key":"10796_CR1","doi-asserted-by":"crossref","unstructured":"Savva M, Kadian A, Maksymets O, Zhao Y, Wijmans E, Jain B, Straub J, Liu J, Koltun V, Malik J et\u00a0al (2019) Habitat: a platform for embodied ai research. In: Proceedings of the IEEE international conference on computer vision (ICCV), pp 9339\u20139347","DOI":"10.1109\/ICCV.2019.00943"},{"key":"10796_CR2","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Mottaghi, R., Kolve, E., Lim, J.J., Gupta, A., Fei-Fei, L., Farhadi, A.: Target-driven visual navigation in indoor scenes using deep reinforcement learning. In: Proceedings - IEEE international conference on robotics and automation (ICRA), pp 3357\u20133364 (2017)","DOI":"10.1109\/ICRA.2017.7989381"},{"key":"10796_CR3","doi-asserted-by":"crossref","unstructured":"Anderson P, Wu Q, Teney D, Bruce J, Johnson M, S\u00fcnderhauf N, Reid I, Gould S, van\u00a0den Hengel A (2018) Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments. In: Proceedings of IEEE conference on computer vision and pattern recognition (CVPR), pp 3674\u20133683","DOI":"10.1109\/CVPR.2018.00387"},{"key":"10796_CR4","doi-asserted-by":"crossref","unstructured":"Das A, Datta S, Gkioxari G, Lee S, Parikh D, Batra D (2018) Embodied question answering. In: Proceedings of IEEE conference on computer vision and pattern recognition (CVPR), pp 2054\u20132063","DOI":"10.1109\/CVPR.2018.00008"},{"key":"10796_CR5","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li L-J, Li K, Fei-Fei L (2009) Imagenet: a large-scale hierarchical image database. In: Proceedings of IEEE conference on computer vision and pattern recognition (CVPR), pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"10796_CR6","doi-asserted-by":"crossref","unstructured":"Lin T-Y, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnick CL (2014) Microsoft coco: Common objects in context. In: Proceedings of european conference on computer vision (ECCV), pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"issue":"1","key":"10796_CR7","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R, Zhu Y, Groth O, Johnson J, Hata K, Kravitz J, Chen S, Kalantidis Y, Li L-J, Shamma DA et al (2017) Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1):32\u201373","journal-title":"Int. J. Comput. Vis."},{"key":"10796_CR8","doi-asserted-by":"crossref","unstructured":"Chaplot DS, Sathyendra KM, Pasumarthi RK, Rajagopal D, Salakhutdinov R (2018) Gated-attention architectures for task-oriented language grounding. In: Proceedings of AAAI conference on artificial intelligence (AAAI), pp 2819\u20132826","DOI":"10.1609\/aaai.v32i1.11832"},{"key":"10796_CR9","unstructured":"Wu Y, Wu Y, Gkioxari G, Tian Y (2018) Building generalizable agents with a realistic and rich 3d environmentc"},{"key":"10796_CR10","unstructured":"Dhiman V, Banerjee S, Griffin B, Siskind JM, Corso JJ (2018) A critical investigation of deep reinforcement learning for navigation"},{"key":"10796_CR11","unstructured":"Kolve E, Mottaghi R, Han W, VanderBilt E, Weihs L, Herrasti A, Gordon D, Zhu Y, Gupta A, Farhadi A (2017) Ai2-thor: An interactive 3d environment for visual ai. arXiv preprint arXiv:1712.05474"},{"issue":"3","key":"10796_CR12","doi-asserted-by":"publisher","first-page":"263","DOI":"10.1007\/s10846-008-9235-4","volume":"53","author":"F Bonin-Font","year":"2008","unstructured":"Bonin-Font F, Ortiz A, Oliver G (2008) Visual navigation for mobile robots: a survey. J. Intell. Robotic Syst. 53(3):263\u2013296","journal-title":"J. Intell. Robotic Syst."},{"issue":"1","key":"10796_CR13","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1007\/s10462-012-9365-8","volume":"43","author":"J Fuentes-Pacheco","year":"2015","unstructured":"Fuentes-Pacheco J, Ascencio JR, Rend\u00f3n-Mancha JM (2015) Visual simultaneous localization and mapping: a survey. Artif. Intell. Rev. 43(1):55\u201381","journal-title":"Artif. Intell. Rev."},{"issue":"7553","key":"10796_CR14","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y LeCun","year":"2015","unstructured":"LeCun Y, Bengio Y, Hinton G (2015) Deep learning. Nature 521(7553):436\u2013444","journal-title":"Nature"},{"key":"10796_CR15","volume-title":"Reinforcement Learning: An Introduction","author":"RS Sutton","year":"2018","unstructured":"Sutton RS, Barto AG (2018) Reinforcement Learning: An Introduction. MIT Press, Cambridge"},{"key":"10796_CR16","doi-asserted-by":"crossref","unstructured":"Mousavi SS, Schukat M, Howley E (2016) Deep reinforcement learning: an overview. In: Proceedings of the SAI intelligent systems conference, pp. 426\u2013440. Springer","DOI":"10.1007\/978-3-319-56991-8_32"},{"key":"10796_CR17","doi-asserted-by":"crossref","unstructured":"Zhu Y, Gordon D, Kolve E, Fox D, Fei-Fei L, Gupta A, Mottaghi R, Farhadi A (2017) Visual semantic planning using deep successor representations. In: Proceedings of the IEEE international conference on computer vision (ICCV), pp 483\u2013492","DOI":"10.1109\/ICCV.2017.60"},{"key":"10796_CR18","unstructured":"Yang W, Wang X, Farhadi A, Gupta A, Mottaghi R (2019) Visual semantic navigation using scene priors. In: Proceedings of international conference on learning representations (ICLR)"},{"key":"10796_CR19","doi-asserted-by":"crossref","unstructured":"Mei H, Bansal M, Walter MR (2016) Listen, attend, and walk: neural mapping of navigational instructions to action sequences. In: Proceedings of AAAI conference on artificial intelligence (AAAI), pp 2772\u20132778","DOI":"10.1609\/aaai.v30i1.10364"},{"key":"10796_CR20","unstructured":"Fried D, Hu R, Cirik V, Rohrbach A, Andreas J, Morency L-P, Berg-Kirkpatrick T, Saenko K, Klein D, Darrell T (2018) Speaker-follower models for vision-and-language navigation. In: Proceedings of the neural information processing systems (NIPS), pp 3314\u20133325"},{"key":"10796_CR21","doi-asserted-by":"crossref","unstructured":"Gordon D, Kembhavi A, Rastegari M, Redmon J, Fox D, Farhadi A (2018) Iqa: visual question answering in interactive environments. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition (CVPR), pp 4089\u20134098","DOI":"10.1109\/CVPR.2018.00430"},{"issue":"5","key":"10796_CR22","doi-asserted-by":"publisher","first-page":"1546","DOI":"10.1109\/TRO.2020.2994002","volume":"36","author":"A Devo","year":"2020","unstructured":"Devo A, Mezzetti G, Costante G, Fravolini ML, Valigi P (2020) Towards generalization in target-driven visual navigation by using deep reinforcement learning. IEEE Trans. Robot. 36(5):1546\u20131561","journal-title":"IEEE Trans. Robot."},{"issue":"12","key":"10796_CR23","doi-asserted-by":"publisher","first-page":"5445","DOI":"10.1109\/TNNLS.2021.3057424","volume":"32","author":"Z Rao","year":"2021","unstructured":"Rao Z, Wu Y, Yang Z, Zhang W, Lu S, Lu W, Zha Z (2021) Visual navigation with multiple goals based on deep reinforcement learning. IEEE Trans Neural Netw Learn Syst 32(12):5445\u20135455","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"10796_CR24","unstructured":"Kulkarni TD, Saeedi A, Gautam S, Gershman SJ (2016) Deep successor reinforcement learning. arXiv preprint arXiv:1606.02396"},{"key":"10796_CR25","doi-asserted-by":"crossref","unstructured":"Tessler C, Givony S, Zahavy T, Mankowitz DJ, Mannor S (2017) A deep hierarchical approach to lifelong learning in minecraft. In: Proceedings of the AAAI conference on artificial intelligence (AAAI), pp 1553\u20131561","DOI":"10.1609\/aaai.v31i1.10744"},{"key":"10796_CR26","unstructured":"Mirowski P, Pascanu R, Viola F, Soyer H, Ballard AJ, Banino A, Denil M, Goroshin R, Sifre L, Kavukcuoglu K, et\u00a0al (2017) Learning to navigate in complex environments. In: Proceedings of the international conference on learning representations (ICLR)"},{"key":"10796_CR27","unstructured":"Jaderberg M, Mnih V, Czarnecki WM, Schaul T, Leibo JZ, Silver D, Kavukcuoglu K (2017) Reinforcement learning with unsupervised auxiliary tasks. In: Proceedings of international conference on learning representations (ICLR)"},{"key":"10796_CR28","doi-asserted-by":"crossref","unstructured":"Kempka M, Wydmuch M, Runc G, Toczek J, Ja\u015bkowski W (2016) Vizdoom: a doom-based ai research platform for visual reinforcement learning. In: Proceedings of IEEE international conference on computational intelligence and games (CIG), pp. 1\u20138","DOI":"10.1109\/CIG.2016.7860433"},{"key":"10796_CR29","unstructured":"Oh J, Chockalingam V, Lee H, et\u00a0al (2016) Control of memory, active perception, and action in minecraft. In: Proceedings of the international conference on machine learning (ICML), pp 2790\u20132799"},{"key":"10796_CR30","unstructured":"Beattie C, Leibo JZ, Teplyashin D, Ward T, Wainwright M, K\u00fcttler H, Lefrancq A, Green S, Vald\u00e9s V, Sadik A, et al (2016) Deepmind lab"},{"key":"10796_CR31","unstructured":"Chaplot DS, Lample G, Sathyendra KM, Salakhutdinov R (2016) Transfer deep reinforcement learning in 3d environments: An empirical study. In: Proceedings of the international conference on neural information processing systems (NIPS)"},{"key":"10796_CR32","unstructured":"Parisotto E, Salakhutdinov R (2018) Neural map: structured memory for deep reinforcement learning. In: Proceedings of international conference on learning representations (ICLR)"},{"key":"10796_CR33","unstructured":"Oh J, Singh S, Lee H, Kohli P (2017) Zero-shot task generalization with multi-task deep reinforcement learning. In: Proceedings of the international conference on machine learning (ICML), pp 2661\u20132670"},{"key":"10796_CR34","doi-asserted-by":"crossref","unstructured":"Pathak D, Mahmoudieh P, Luo G, Agrawal P, Chen D, Shentu Y, Shelhamer E, Malik J, Efros AA, Darrell T (2018) Zero-shot visual imitation. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition (CVPR), pp 2050\u20132053","DOI":"10.1109\/CVPRW.2018.00278"},{"key":"10796_CR35","doi-asserted-by":"crossref","unstructured":"Wortsman M, Ehsani K, Rastegari M, Farhadi A, Mottaghi R (2019) Learning to learn how to learn: Self-adaptive visual navigation using meta-learning. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition (CVPR), pp 6750\u20136759","DOI":"10.1109\/CVPR.2019.00691"},{"key":"10796_CR36","doi-asserted-by":"crossref","unstructured":"Fang Q, Xu X, Wang X, Zeng Y (2021) Target-driven visual navigation in indoor scenes using reinforcement learning and imitation learning. CAAI T. Intell, Technol","DOI":"10.1117\/12.2581306"},{"key":"10796_CR37","doi-asserted-by":"crossref","unstructured":"Song S, Yu F, Zeng A, Chang AX, Savva M, Funkhouser T (2017) Semantic scene completion from a single depth image. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition (CVPR), pp 1746\u20131754","DOI":"10.1109\/CVPR.2017.28"},{"key":"10796_CR38","doi-asserted-by":"crossref","unstructured":"Chang A, Dai A, Funkhouser TA, Halber M, Niebner M, Savva M, Song S, Zeng A, Zhang Y (2018) Matterport3d: Learning from rgb-d data in indoor environments. In: Proceedings of international conference on 3D vision (3DV), pp 667\u2013676","DOI":"10.1109\/3DV.2017.00081"},{"key":"10796_CR39","unstructured":"Kipf TN, Welling M (2017) Semi-supervised classification with graph convolutional networks. In: Proceedings of international conference on learning representations (ICLR)"},{"key":"10796_CR40","unstructured":"Mnih V, Badia AP, Mirza M, Graves A, Lillicrap T, Harley T, Silver D, Kavukcuoglu K (2016) Asynchronous methods for deep reinforcement learning. In: Proceedings of international conference on machine learning (ICML), pp 1928\u20131937"},{"issue":"8","key":"10796_CR41","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput. 9(8):1735\u20131780","journal-title":"Neural Comput."},{"key":"10796_CR42","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition (CVPR), pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"10796_CR43","doi-asserted-by":"crossref","unstructured":"Chopra S, Hadsell R, LeCun Y, et\u00a0al (2005) Learning a similarity metric discriminatively, with application to face verification. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition (CVPR), pp 539\u2013546","DOI":"10.1109\/CVPR.2005.202"},{"key":"10796_CR44","unstructured":"Redmon J, Farhadi A (2018) Yolov3: an incremental improvement"},{"key":"10796_CR45","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. In: Proceedings of the neural information processing systems (NIPS), pp 5998\u20136008"},{"issue":"2","key":"10796_CR46","doi-asserted-by":"publisher","first-page":"1072","DOI":"10.1109\/JIOT.2019.2949715","volume":"7","author":"H Zhang","year":"2019","unstructured":"Zhang H, Xiao Z, Wang J, Li F, Szczerbicki E (2019) A novel iot-perceptive human activity recognition (har) approach using multihead convolutional attention. IEEE Internet Things J. 7(2):1072\u20131080","journal-title":"IEEE Internet Things J."},{"key":"10796_CR47","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1016\/j.ins.2021.04.053","volume":"571","author":"Z Xiao","year":"2021","unstructured":"Xiao Z, Xu X, Xing H, Luo S, Dai P, Zhan D (2021) Rtfn: a robust temporal feature network for time series classification. Inf. Sci. 571:65\u201386","journal-title":"Inf. Sci."},{"key":"10796_CR48","unstructured":"Andrychowicz M, Wolski F, Ray A, Schneider J, Fong R, Welinder P, McGrew B, Tobin J, Abbeel OP, Zaremba W (2017) Hindsight experience replay. In: Proceedings of the neural information processing systems (NIPS), pp 5048\u20135058"},{"key":"10796_CR49","unstructured":"Anderson P, Chang A, Chaplot DS, Dosovitskiy A, Gupta S, Koltun V, Kosecka J, Malik J, Mottaghi R, Savva M, et al (2018) On evaluation of embodied navigation agents"},{"key":"10796_CR50","unstructured":"Bojarski M, Del\u00a0Testa D, Dworakowski D, Firner B, Flepp B, Goyal P, Jackel LD, Monfort M, Muller U, Zhang J, et al (2016) End to end learning for self-driving cars"},{"key":"10796_CR51","unstructured":"Abadi M, Agarwal A, Barham P, Brevdo E, Chen Z, Citro C, Corrado GS, Davis A, Dean J, Devin M, et al (2016) Tensorflow: large-scale machine learning on heterogeneous distributed systems"},{"issue":"Nov","key":"10796_CR52","first-page":"2579","volume":"9","author":"Maaten Lvd","year":"2008","unstructured":"Lvd Maaten, Hinton G (2008) Visualizing data using t-sne. J. Mach. Learn. Res. 9(Nov):2579\u20132605","journal-title":"J. Mach. Learn. Res."}],"container-title":["Neural Processing Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-022-10796-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11063-022-10796-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-022-10796-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,14]],"date-time":"2022-10-14T07:14:50Z","timestamp":1665731690000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11063-022-10796-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,23]]},"references-count":52,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2022,10]]}},"alternative-id":["10796"],"URL":"https:\/\/doi.org\/10.1007\/s11063-022-10796-8","relation":{},"ISSN":["1370-4621","1573-773X"],"issn-type":[{"value":"1370-4621","type":"print"},{"value":"1573-773X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,3,23]]},"assertion":[{"value":"3 March 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 March 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}