{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,21]],"date-time":"2025-12-21T06:24:54Z","timestamp":1766298294363,"version":"3.37.3"},"reference-count":45,"publisher":"Informa UK Limited","issue":"18","funder":[{"DOI":"10.13039\/501100001691","name":"JSPS KAKENHI","doi-asserted-by":"publisher","award":["23K03478"],"award-info":[{"award-number":["23K03478"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100020963","name":"JST Moonshot","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100020963","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003382","name":"JST CREST","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003382","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003051","name":"NEDO","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003051","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["www.tandfonline.com"],"crossmark-restriction":true},"short-container-title":["Advanced Robotics"],"published-print":{"date-parts":[[2024,9,16]]},"DOI":"10.1080\/01691864.2024.2388114","type":"journal-article","created":{"date-parts":[[2024,8,9]],"date-time":"2024-08-09T20:14:57Z","timestamp":1723234497000},"page":"1265-1276","update-policy":"https:\/\/doi.org\/10.1080\/tandf_crossmark_01","source":"Crossref","is-referenced-by-count":2,"title":["Nearest neighbor future captioning: generating descriptions for possible collisions in object placement tasks"],"prefix":"10.1080","volume":"38","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-7372-8456","authenticated-orcid":false,"given":"Takumi","family":"Komatsu","sequence":"first","affiliation":[{"name":"Faculty of Science and Technology, Keio University, Yokohama, Kanagawa, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1991-9119","authenticated-orcid":false,"given":"Motonari","family":"Kambara","sequence":"additional","affiliation":[{"name":"Faculty of Science and Technology, Keio University, Yokohama, Kanagawa, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6338-3098","authenticated-orcid":false,"given":"Shumpei","family":"Hatanaka","sequence":"additional","affiliation":[{"name":"Faculty of Science and Technology, Keio University, Yokohama, Kanagawa, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5617-507X","authenticated-orcid":false,"given":"Haruka","family":"Matsuo","sequence":"additional","affiliation":[{"name":"Faculty of Science and Technology, Keio University, Yokohama, Kanagawa, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3851-5221","authenticated-orcid":false,"given":"Tsubasa","family":"Hirakawa","sequence":"additional","affiliation":[{"name":"Faculty of Science and Technology, Chubu University, Kasugai, Aichi, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2631-9856","authenticated-orcid":false,"given":"Takayoshi","family":"Yamashita","sequence":"additional","affiliation":[{"name":"Faculty of Science and Technology, Chubu University, Kasugai, Aichi, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7391-4725","authenticated-orcid":false,"given":"Hironobu","family":"Fujiyoshi","sequence":"additional","affiliation":[{"name":"Faculty of Science and Technology, Chubu University, Kasugai, Aichi, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0261-0510","authenticated-orcid":false,"given":"Komei","family":"Sugiura","sequence":"additional","affiliation":[{"name":"Faculty of Science and Technology, Keio University, Yokohama, Kanagawa, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"301","published-online":{"date-parts":[[2024,8,9]]},"reference":[{"key":"e_1_3_3_2_1","doi-asserted-by":"publisher","DOI":"10.1186\/s40648-019-0129-y"},{"key":"e_1_3_3_3_1","unstructured":"Yi K Gan C Li Y et al. CLEVRER: collision events for video representation and reasoning. In: ICLR; New Orleans USA; 2019."},{"key":"e_1_3_3_4_1","doi-asserted-by":"crossref","unstructured":"Kambara M Sugiura K. Relational future captioning model for explaining likely collisions in daily tasks. In: ICIP; Bordeaux France; 2022. p. 2601\u20132605.","DOI":"10.1109\/ICIP46576.2022.9897231"},{"key":"e_1_3_3_5_1","unstructured":"Chen Z Yi K Li Y et\u00a0al. COMPHY: compositional physical reasoning of objects and events from videos. In: ICLR; 2022."},{"key":"e_1_3_3_6_1","unstructured":"Khandelwal U Levy O Jurafsky D et al. Generalization through memorization: nearest neighbor language models. In: ICLR; New Orleans USA; 2019."},{"key":"e_1_3_3_7_1","unstructured":"Khandelwal U Fan A Jurafsky D et\u00a0al. Nearest neighbor machine translation. In: ICLR; 2020."},{"key":"e_1_3_3_8_1","unstructured":"Xu K Ba J Kiros R et al. Show attend and tell: neural image caption generation with visual attention. In: ICML; Lille France; 2015. p. 2048\u20132057."},{"key":"e_1_3_3_9_1","doi-asserted-by":"crossref","unstructured":"Krishna R Hata K Ren F et al. Dense-captioning events in videos. In: ICCV; Venice Italy; 2017. p. 706\u2013715.","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_3_10_1","doi-asserted-by":"crossref","unstructured":"Wang X Chen W Wu J et al. Video captioning via hierarchical reinforcement learning. In: CVPR; Utah USA; 2018. p. 4213\u20134222.","DOI":"10.1109\/CVPR.2018.00443"},{"key":"e_1_3_3_11_1","doi-asserted-by":"crossref","unstructured":"Lei J Wang L Shen Y et\u00a0al. Mart: memory-augmented recurrent transformer for coherent video paragraph captioning. In: ACL; 2020. p. 2603\u20132614.","DOI":"10.18653\/v1\/2020.acl-main.233"},{"key":"e_1_3_3_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3148210"},{"key":"e_1_3_3_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3355390"},{"key":"e_1_3_3_14_1","doi-asserted-by":"crossref","unstructured":"Hosseinzadeh M Wang Y. Video captioning of future frames. In: WACV; Hawai USA; 2021. p. 980\u2013989.","DOI":"10.1109\/WACV48630.2021.00102"},{"key":"e_1_3_3_15_1","doi-asserted-by":"crossref","unstructured":"Mori Y Hirakawa T Yamashita T et\u00a0al. Image captioning in near future from vehicle camera images and motion information. In: IEEE IV; 2021. p. 1378\u20131384.","DOI":"10.1109\/IV48863.2021.9575562"},{"key":"e_1_3_3_16_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2021.103230"},{"key":"e_1_3_3_17_1","doi-asserted-by":"crossref","unstructured":"Deng C Chen S Chen D et\u00a0al. Sketch ground and refine: top-down dense video captioning. In: CVPR; 2021. p. 234\u2013243.","DOI":"10.1109\/CVPR46437.2021.00030"},{"key":"e_1_3_3_18_1","doi-asserted-by":"crossref","unstructured":"Zhang Z Qi Z Yuan C et\u00a0al. Open-book video captioning with retrieve-copy-generate network. In: CVPR; 2021. p. 9837\u20139846.","DOI":"10.1109\/CVPR46437.2021.00971"},{"key":"e_1_3_3_19_1","first-page":"6634","article-title":"Multi-Modal dependency tree for video captioning","volume":"34","author":"Zhao W","year":"2021","unstructured":"Zhao W, Wu X, Luo J. Multi-Modal dependency tree for video captioning. Adv Neural Inf Process Syst. 2021;34:6634\u20136645.","journal-title":"Adv Neural Inf Process Syst"},{"key":"e_1_3_3_20_1","doi-asserted-by":"crossref","unstructured":"Sun C Myers A Vondrick C et al. Videobert: a joint model for video and language representation learning. In: ICCV; Seoul Korea; 2019. p. 7464\u20137473.","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_3_21_1","unstructured":"Luo H Ji L Shi B et\u00a0al. Univl: a unified video and language pre-training model for multimodal understanding and generation. preprint 2020. arXiv:200206353."},{"key":"e_1_3_3_22_1","doi-asserted-by":"crossref","unstructured":"Li L Chen YC Cheng Y et\u00a0al. HERO: hierarchical encoder for video+ language omni-representation pre-training. In: EMNLP; 2020. p. 2046\u20132065.","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"e_1_3_3_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.3010735"},{"key":"e_1_3_3_24_1","unstructured":"Magassouba A Sugiura K Kawai H. Multimodal attention branch network for perspective-free sentence generation. In: CORL; 2020. p. 76\u201385."},{"key":"e_1_3_3_25_1","doi-asserted-by":"crossref","unstructured":"Fukui H Hirakawa T Yamashita T et al. Attention branch network: learning of attention mechanism for visual explanation. In: CVPR; California USA; 2019. p. 10705\u201310714.","DOI":"10.1109\/CVPR.2019.01096"},{"key":"e_1_3_3_26_1","doi-asserted-by":"crossref","unstructured":"Plummer BA Wang L Cervantes CM et al. Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: ICCV; Santiago Chile; 2015. p. 2641\u20132649.","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_3_27_1","doi-asserted-by":"crossref","unstructured":"Lin TY Maire M Belongie S et al. Microsoft COCO: common objects in context. In: ECCV; Zurich Switzerland; 2014. p. 740\u2013755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_28_1","doi-asserted-by":"crossref","unstructured":"Zhou L Xu C Corso JJ. Towards automatic learning of procedures from web instructional videos. In: AAAI; New Orleans USA; 2018. p. 7590\u20137598.","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"e_1_3_3_29_1","doi-asserted-by":"crossref","unstructured":"Heilbron FC Escorcia V Ghanem B et al. Activitynet: a large-scale video benchmark for human activity understanding. In: CVPR; Boston USA; 2015. p. 961\u2013970.","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_3_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2017.2723903"},{"key":"e_1_3_3_31_1","unstructured":"Gan C Schwartz J Alter S et\u00a0al. ThreeDWorld: a platform for interactive multi-modal physical simulation. In: NeurIPS D&B; 2021."},{"key":"e_1_3_3_32_1","doi-asserted-by":"crossref","unstructured":"Haddadin S Albu-Schaffer A De Luca A et\u00a0al. Collision detection and reaction: A contribution to safe physical human-robot interaction. In: IROS; IEEE; 2008. p. 3356\u20133363.","DOI":"10.1109\/IROS.2008.4650764"},{"key":"e_1_3_3_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2019.2893400"},{"key":"e_1_3_3_34_1","doi-asserted-by":"crossref","unstructured":"Mottaghi R Rastegari M Gupta A et al. \u2018What happens if\u00b7\u00b7\u00b7\u2009\u2019 learning to predict the effect of forces in images. In: ECCV; 2016. p. 269\u2013285.\u00a0Amsterdam The Netherlands.","DOI":"10.1007\/978-3-319-46493-0_17"},{"key":"e_1_3_3_35_1","doi-asserted-by":"publisher","DOI":"10.1080\/01691864.2021.1913446"},{"key":"e_1_3_3_36_1","doi-asserted-by":"crossref","unstructured":"Papineni K Roukos S Ward T et al. BLEU: a method for automatic evaluation of machine translation. In: ACL; Philadelphia USA; 2002. p. 311\u2013318.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_3_37_1","unstructured":"Banerjee S Lavie A. METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: The ACL Workshop on IEEM for MTS; Ann Arbor USA; 2005. p. 65\u201372."},{"key":"e_1_3_3_38_1","unstructured":"Lin CY. ROUGE: a package for automatic evaluation of summaries. In: Text summarization branches out; Barcelona Spain; 2004. p. 74\u201381."},{"key":"e_1_3_3_39_1","doi-asserted-by":"crossref","unstructured":"Vedantam R Lawrence Zitnick C Parikh D. CIDEr: consensus-based image description evaluation. In: CVPR; Boston USA; 2015. p. 4566\u20134575.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_3_40_1","unstructured":"Ren S He K Girshick R et al. Faster R-CNN: towards real-time object detection with region proposal networks. In: NeurIPS; Quebec Canada; 2015."},{"key":"e_1_3_3_41_1","doi-asserted-by":"crossref","unstructured":"He K Zhang X Ren S et al. Deep residual learning for image recognition. In: CVPR; Las Vegas USA; 2016. p. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_42_1","unstructured":"Vaswani A Shazeer N Parmar N et al. Attention is all you need. In: NIPS; Long Beach USA; 2017."},{"key":"e_1_3_3_43_1","unstructured":"Radford A Kim W Hallacy C et\u00a0al. Learning transferable visual models from natural language supervision. In: ICML; 2021. p. 8748\u20138763."},{"key":"e_1_3_3_44_1","doi-asserted-by":"crossref","unstructured":"Inamura T Tan JTC Sugiura K et al. Development of robocup@ home simulation towards long-term large scale HRI. In: RoboCup 2013: Robot World Cup XVII 17; Eindhoven the Netherlands; 2014. p. 672\u2013680.","DOI":"10.1007\/978-3-662-44468-9_64"},{"key":"e_1_3_3_45_1","doi-asserted-by":"publisher","DOI":"10.1080\/01691864.2019.1663608"},{"key":"e_1_3_3_46_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0893-6080(98)00010-0"}],"container-title":["Advanced Robotics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.tandfonline.com\/doi\/pdf\/10.1080\/01691864.2024.2388114","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,17]],"date-time":"2024-10-17T17:40:04Z","timestamp":1729186804000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.tandfonline.com\/doi\/full\/10.1080\/01691864.2024.2388114"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,9]]},"references-count":45,"journal-issue":{"issue":"18","published-print":{"date-parts":[[2024,9,16]]}},"alternative-id":["10.1080\/01691864.2024.2388114"],"URL":"https:\/\/doi.org\/10.1080\/01691864.2024.2388114","relation":{},"ISSN":["0169-1864","1568-5535"],"issn-type":[{"type":"print","value":"0169-1864"},{"type":"electronic","value":"1568-5535"}],"subject":[],"published":{"date-parts":[[2024,8,9]]},"assertion":[{"value":"The publishing and review policy for this title is described in its Aims & Scope.","order":1,"name":"peerreview_statement","label":"Peer Review Statement"},{"value":"http:\/\/www.tandfonline.com\/action\/journalInformation?show=aimsScope&journalCode=tadr20","URL":"http:\/\/www.tandfonline.com\/action\/journalInformation?show=aimsScope&journalCode=tadr20","order":2,"name":"aims_and_scope_url","label":"Aim & Scope"},{"value":"2024-01-31","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-05-10","order":1,"name":"revised","label":"Revised","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-07-04","order":2,"name":"accepted","label":"Accepted","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-08-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}