{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T16:20:29Z","timestamp":1781194829720,"version":"3.54.1"},"reference-count":37,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,5]]},"DOI":"10.1109\/icra.2018.8460857","type":"proceedings-article","created":{"date-parts":[[2018,9,21]],"date-time":"2018-09-21T22:28:03Z","timestamp":1537568883000},"page":"3782-3788","source":"Crossref","is-referenced-by-count":53,"title":["Translating Videos to Commands for Robotic Manipulation with Deep Recurrent Neural Networks"],"prefix":"10.1109","author":[{"given":"Anh","family":"Nguyen","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dimitrios","family":"Kanoulas","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Luca","family":"Muratore","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Darwin G.","family":"Caldwell","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nikos G.","family":"Tsagarakis","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1002\/rob.21702"},{"key":"ref32","article-title":"The language of actions: Recovering the syntax and semantics of goal-directed human activities","author":"kuehne","year":"2014","journal-title":"Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"ref31","article-title":"Learning convolutional action primitives for fine-grained action recognition","author":"lea","year":"2016","journal-title":"Int Conf Robotics and Automation (ICRA)"},{"key":"ref30","article-title":"Msr-vtt: A large video description dataset for bridging video and language","author":"xu","year":"2016","journal-title":"Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"ref37","article-title":"Object-Based Affordances Detection with Convolutional Neural Networks and Dense Conditional Random Fields","author":"nguyen","year":"2017","journal-title":"International Conference on Intelligent Robots and Systems (IROS)"},{"key":"ref36","article-title":"Affordancenet: An end-to-end deep learning approach for object affordance detection","author":"do","year":"2018","journal-title":"Int Conf Robotics and Automation (ICRA)"},{"key":"ref35","article-title":"OpenSoT: A Whole-Body Control Library for the Compliant Humanoid Robot COMAN","author":"rocchi","year":"2015","journal-title":"International Conference on Robotics and Automation (ICRA)"},{"key":"ref34","article-title":"Xbotcore: A real-time cross-robot software platform","author":"muratore","year":"2017","journal-title":"International conference on Robotic Computing"},{"key":"ref10","article-title":"Semantic decomposition and recognition of long and complex manipulation action sequences","author":"aksoy","year":"2016","journal-title":"International Journal of Computer Vision"},{"key":"ref11","article-title":"Robot learning manipulation action plans by &#x201C;watching&#x201D; unconstrained videos from the world wide web","author":"yang","year":"2015","journal-title":"AAAI Conference on Artificial Intelligence"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2017.2669363"},{"key":"ref13","article-title":"Understanding natural language commands for robotic navigation and mobile manipulation","author":"tellex","year":"2011","journal-title":"AAAI Conference on Artificial Intelligence"},{"key":"ref14","article-title":"Grounding spatial relations for human-robot interaction","author":"guadarrama","year":"2013","journal-title":"International Conference on Intelligent Robots and Systems (IROS)"},{"key":"ref15","article-title":"Long-term recurrent convolutional networks for visual recognition and description","author":"donahue","year":"2014","journal-title":"Computer Vision and Pattern Recognition (CVPR)"},{"key":"ref16","article-title":"Video paragraph captioning using hierarchical recurrent neural networks","author":"yu","year":"2015","journal-title":"Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref18","article-title":"Learning phrase representations using RNN encoder-decoder for statistical machine translation","author":"cho","year":"2014","journal-title":"arXiv 1406 1078"},{"key":"ref19","author":"nguyen","year":"2017","journal-title":"Real-time pose estimation for event cameras with stacked spatial lstm networks"},{"key":"ref28","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"2014","journal-title":"Advances in Neural Information Processing Systems (NIPS)"},{"key":"ref4","article-title":"Trajectories and keyframes for kinesthetic teaching: A human-robot interaction perspective","author":"akgun","year":"2012","journal-title":"International Conference on Human-Robot Interaction (HRI)"},{"key":"ref27","author":"venugopalan","year":"2014","journal-title":"Translating videos to natural language using deep recurrent neural networks"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.robot.2008.10.024"},{"key":"ref6","article-title":"Large-scale video classification with convolutional neural networks","author":"karpathy","year":"2014","journal-title":"Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"ref29","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014","journal-title":"International Conference on Learning Representations"},{"key":"ref5","article-title":"Real-time imitation of human whole-body motions by humanoids","author":"koenemann","year":"2014","journal-title":"International Conference on Robotics and Automation (ICRA)"},{"key":"ref8","author":"venugopalan","year":"2016","journal-title":"Sequence to sequence-video to text"},{"key":"ref7","article-title":"Two-stream convolutional networks for action recognition in videos","author":"simonyan","year":"2014","journal-title":"Conference on Neural Information Processing Systems (NIPS)"},{"key":"ref2","article-title":"Transferring skills to humanoid robots by extracting semantic representations from observations of human activities","author":"ramirez-amaro","year":"2015","journal-title":"Artificial Intelligence"},{"key":"ref9","article-title":"Learning manipulation actions from human demonstrations","author":"welschehold","year":"2016","journal-title":"International Conference on Intelligent Robots and Systems (IROS)"},{"key":"ref1","author":"chrystopher","year":"2009","journal-title":"Imitation and Social Learning in Robots Humans and Animals Behavioural Social and Communicative Dimensions"},{"key":"ref20","article-title":"Top-down visual saliency guided by captions","author":"ramanishka","year":"2017","journal-title":"Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref21","article-title":"Distributed representations of words and phrases and their compo-sitionality","author":"mikolov","year":"2013","journal-title":"Advances in Neural Information Processing Systems (NIPS)"},{"key":"ref24","article-title":"Rethinking the inception architecture for computer vision","author":"szegedy","year":"2016","journal-title":"Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"ref23","article-title":"Very Deep Convolutional Networks for Large-Scale Image Recognition","volume":"abs 1409 1556","author":"simonyan","year":"2014","journal-title":"CoRR"},{"key":"ref26","article-title":"TensorFlow: Large-scale machine learning on heterogeneous systems","author":"abadi","year":"2015","journal-title":"software available from tensorfloworg"},{"key":"ref25","article-title":"Deep residual learning for image recognition","author":"he","year":"2016","journal-title":"Conference on Computer Vision and Pattern Recognition (CVPR)"}],"event":{"name":"2018 IEEE International Conference on Robotics and Automation (ICRA)","location":"Brisbane, QLD","start":{"date-parts":[[2018,5,21]]},"end":{"date-parts":[[2018,5,25]]}},"container-title":["2018 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8449910\/8460178\/08460857.pdf?arnumber=8460857","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,8,23]],"date-time":"2020-08-23T23:05:42Z","timestamp":1598223942000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8460857\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,5]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/icra.2018.8460857","relation":{},"subject":[],"published":{"date-parts":[[2018,5]]}}}