{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T16:49:21Z","timestamp":1776098961433,"version":"3.50.1"},"reference-count":28,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,5]]},"DOI":"10.1109\/icra.2018.8460699","type":"proceedings-article","created":{"date-parts":[[2018,9,21]],"date-time":"2018-09-21T22:28:03Z","timestamp":1537568883000},"page":"3774-3781","source":"Crossref","is-referenced-by-count":121,"title":["Interactively Picking Real-World Objects with Unconstrained Spoken Language Instructions"],"prefix":"10.1109","author":[{"given":"Jun","family":"Hatori","sequence":"first","affiliation":[]},{"given":"Yuta","family":"Kikuchi","sequence":"additional","affiliation":[]},{"given":"Sosuke","family":"Kobayashi","sequence":"additional","affiliation":[]},{"given":"Kuniyuki","family":"Takahashi","sequence":"additional","affiliation":[]},{"given":"Yuta","family":"Tsuboi","sequence":"additional","affiliation":[]},{"given":"Yuya","family":"Unno","sequence":"additional","affiliation":[]},{"given":"Wilson","family":"Ko","sequence":"additional","affiliation":[]},{"given":"Jethro","family":"Tan","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-27146-0_14"},{"key":"ref11","first-page":"595","article-title":"Multimodal neural language models","volume":"32","author":"kiros","year":"2014","journal-title":"International Conference on Machine Learning ser Proceedings of Machine Learning Research"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref13","article-title":"Devise: A deep visual-semantic embedding model","author":"frome","year":"2013","journal-title":"Neural Information Processing Systems"},{"key":"ref14","article-title":"Unifying visual-semantic embeddings with multimodal neural language models","author":"kiros","year":"2014","journal-title":"arXiv 1411 2539"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref16","article-title":"Densecap: Fully convolutional localization networks for dense captioning","author":"johnson","year":"2016","journal-title":"IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref17","article-title":"Visual relationship detection with language priors","author":"lu","year":"2016","journal-title":"European Conference on Computer Vision"},{"key":"ref18","article-title":"Referit game: Referring to objects in photographs of natural scenes","author":"kazemzadeh","year":"2014","journal-title":"Conference on Empirical Methods in Natural Language Processing"},{"key":"ref19","article-title":"Generation and comprehension of unambiguous object descriptions","author":"mao","year":"2016","journal-title":"IEEE Conf Computer Vision and Pattern Recognition"},{"key":"ref28","article-title":"Deep residual learning for image recognition","author":"he","year":"2016","journal-title":"IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.28"},{"key":"ref27","article-title":"ImageNet: A Large-Scale Hierarchical Image Database","author":"deng","year":"2009","journal-title":"Computer Vision and Pattern Recognition"},{"key":"ref3","article-title":"Ssd: Single shot multibox detector","author":"liu","year":"2016","journal-title":"European Conference on Computer Vision"},{"key":"ref6","article-title":"A joint speaker-listener-reinforcer model for referring expressions","author":"licheng","year":"2017","journal-title":"IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref5","article-title":"Modeling context in referring expressions","author":"yu","year":"2016","journal-title":"European Conference on Computer Vision"},{"key":"ref8","author":"shridhar","year":"2017","journal-title":"Grounding spatio-semantic referring expressions for human-robot interaction"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2016.XII.037"},{"key":"ref2","first-page":"1","article-title":"Probabilistic detection of pointing directions for human-robot interaction","author":"shukla","year":"2015","journal-title":"International Conference on Digital Image Computing Techniques and Applications"},{"key":"ref9","first-page":"1006","article-title":"Reducing errors in object-fetching interactions through social feedback","author":"whitney","year":"2017","journal-title":"IEEE International Conference on Robotics and Automation"},{"key":"ref1","first-page":"3272","article-title":"Remote control system for multiple mobile robots using touch panel interface and autonomous mobility","author":"ochiai","year":"2014","journal-title":"IEEE International Conference on Intelligent Robots and Systems"},{"key":"ref20","first-page":"792","article-title":"Modeling context between objects for referring expression understanding","author":"nagaraja","year":"2016","journal-title":"European Conference on Computer Vision"},{"key":"ref22","article-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","author":"ren","year":"2015","journal-title":"Advances in neural information processing systems"},{"key":"ref21","article-title":"Vqa: Visual question answering","author":"antol","year":"2015","journal-title":"IEEE International Conference on Computer Vision"},{"key":"ref24","article-title":"Microsoft COCO: Common objects in context","author":"lin","year":"2014","journal-title":"European Conference on Computer Vision"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"ref26","article-title":"ChainerCV: a library for deep learning in computer vision","author":"niitani","year":"2017","journal-title":"Proc ACM Workshops on Multimedia"},{"key":"ref25","article-title":"Chainer: a next-generation open source framework for deep learning","author":"tokui","year":"2015","journal-title":"Workshop on machine learning systems on Neural Information Processing Systems"}],"event":{"name":"2018 IEEE International Conference on Robotics and Automation (ICRA)","location":"Brisbane, QLD","start":{"date-parts":[[2018,5,21]]},"end":{"date-parts":[[2018,5,25]]}},"container-title":["2018 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8449910\/8460178\/08460699.pdf?arnumber=8460699","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,8,24]],"date-time":"2020-08-24T02:57:45Z","timestamp":1598237865000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8460699\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,5]]},"references-count":28,"URL":"https:\/\/doi.org\/10.1109\/icra.2018.8460699","relation":{},"subject":[],"published":{"date-parts":[[2018,5]]}}}