{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T00:07:10Z","timestamp":1755907630740,"version":"3.44.0"},"reference-count":60,"publisher":"Informa UK Limited","issue":"13","funder":[{"DOI":"10.13039\/501100001691","name":"JSPS","doi-asserted-by":"publisher","award":["23H03478","JP23KJ1917"],"award-info":[{"award-number":["23H03478","JP23KJ1917"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["www.tandfonline.com"],"crossmark-restriction":true},"short-container-title":["Advanced Robotics"],"published-print":{"date-parts":[[2025,7,3]]},"DOI":"10.1080\/01691864.2025.2532610","type":"journal-article","created":{"date-parts":[[2025,7,17]],"date-time":"2025-07-17T14:31:03Z","timestamp":1752762663000},"page":"806-816","update-policy":"https:\/\/doi.org\/10.1080\/tandf_crossmark_01","source":"Crossref","is-referenced-by-count":0,"title":["Pre-manipulation alignment prediction with parallel deep state-space and transformer models"],"prefix":"10.1080","volume":"39","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1991-9119","authenticated-orcid":false,"given":"Motonari","family":"Kambara","sequence":"first","affiliation":[{"name":"Keio University","place":["Yokohama, Japan"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0261-0510","authenticated-orcid":false,"given":"Komei","family":"Sugiura","sequence":"additional","affiliation":[{"name":"Keio University","place":["Yokohama, Japan"]}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"301","published-online":{"date-parts":[[2025,7,17]]},"reference":[{"key":"e_1_3_2_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10514-023-10139-z"},{"key":"e_1_3_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2017.2655622"},{"key":"e_1_3_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/Access.6287639"},{"key":"e_1_3_2_5_1","unstructured":"Goko M Kambara M Saito D et\u00a0al. Task success prediction for open-vocabulary manipulation based on multi-level aligned representations. In: 8th Annual Conference on Robot Learning; Munich; 2024."},{"key":"e_1_3_2_6_1","unstructured":"Xiao T Chan H Sermanet P et\u00a0al. Skill acquisition by instruction augmentation on offline datasets. In: NeurIPS 2022 Foundation Models for Decision Making Workshop; New Orleans; 2022."},{"key":"e_1_3_2_7_1","unstructured":"Liu Z Bahety A Song S. REFLECT: Summarizing robot experiences for failure explanation and correction. In: Conference on Robot Learning (CoRL); Atlanta; 2023. p.\u00a03468\u20133484."},{"key":"e_1_3_2_8_1","doi-asserted-by":"crossref","unstructured":"Inceoglu A Aksoy EE Ak AC et\u00a0al. Fino-Net: a deep multimodal sensor fusion framework for manipulation failure detection. In: 2021 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS); Prague; 2021. p.\u00a06841\u20136847.","DOI":"10.1109\/IROS51168.2021.9636455"},{"key":"e_1_3_2_9_1","unstructured":"Driess D Xia F Sajjadi MS et\u00a0al. PaLM-E: an embodied multimodal language model. In: Proceedings of the 40th International Conference on Machine Learning; Honolulu; 2023. p.\u00a08469\u20138488."},{"key":"e_1_3_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3188889"},{"key":"e_1_3_2_11_1","doi-asserted-by":"crossref","unstructured":"Das D Chernova S. Semantic-based explainable AI: leveraging semantic scene graphs and pairwise ranking to explain robot failures. In: 2021 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS); Prague; 2021. p.\u00a03034\u20133041.","DOI":"10.1109\/IROS51168.2021.9635890"},{"key":"e_1_3_2_12_1","doi-asserted-by":"crossref","unstructured":"Ghosh D Walke H et\u00a0al. Octo Model Team Octo: an open-source generalist robot policy. In: Robotics: Science and Systems; Delft; 2024.","DOI":"10.15607\/RSS.2024.XX.090"},{"key":"e_1_3_2_13_1","doi-asserted-by":"crossref","unstructured":"Brohan A Brown N Carbajal J et\u00a0al. RT-1: robotics transformer for real-world control at scale. Preprint; 2022. Available at arXiv:221206817.","DOI":"10.15607\/RSS.2023.XIX.025"},{"key":"e_1_3_2_14_1","unstructured":"Kim M Pertsch K Karamcheti S et\u00a0al. OpenVLA: an open-source vision-language-action model. In: CoRL; Munich; 2024."},{"key":"e_1_3_2_15_1","unstructured":"Black K Brown N Driess D et\u00a0al. \u03c00: a vision-language-action flow model for general robot control. Preprint; 2024. Available at arXiv:241024164."},{"key":"e_1_3_2_16_1","unstructured":"Vaswani A Shazeer N Parmar N et\u00a0al. Attention Is All You Need. In: 31st Conference on Neural Information Processing Systems (NIPS 2017); Long Beach; 2017."},{"key":"e_1_3_2_17_1","unstructured":"Gu A Goel K R\u00e9 C. Efficiently modeling long sequences with structured state spaces. In: ICLR; Virtual; 2022."},{"key":"e_1_3_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2017.2723903"},{"key":"e_1_3_2_19_1","doi-asserted-by":"crossref","unstructured":"Mottaghi R Rastegari M Gupta A et\u00a0al. \u201cWhat Happens If\u2026\u201d learning to predict the effect of forces in images. In: European Conference on Computer Vision; Amsterdam; 2016. p.\u00a0269\u2013285.","DOI":"10.1007\/978-3-319-46493-0_17"},{"key":"e_1_3_2_20_1","doi-asserted-by":"publisher","DOI":"10.1080\/01691864.2021.1913446"},{"key":"e_1_3_2_21_1","doi-asserted-by":"crossref","unstructured":"Kambara M Sugiura K. Relational future captioning model for explaining likely collisions in daily tasks. In: 2022 IEEE International Conference on Image Processing (ICIP); Bordeaux; 2022. p.\u00a02601\u20132605.","DOI":"10.1109\/ICIP46576.2022.9897231"},{"key":"e_1_3_2_22_1","doi-asserted-by":"crossref","unstructured":"Liu H Dass S Mart\u00edn-Mart\u00edn R et\u00a0al. Model-based runtime monitoring with interactive imitation learning. In: 2024 IEEE International Conference on Robotics and Automation (ICRA); Yokohama; 2024. p.\u00a04154\u20134161.","DOI":"10.1109\/ICRA57147.2024.10611038"},{"key":"e_1_3_2_23_1","doi-asserted-by":"crossref","unstructured":"Kawaharazuka K Matsushima T Gambardella A et\u00a0al. Real-world robot applications of foundation models: a review. Preprint; 2024. Available at arXiv:240205741.","DOI":"10.1080\/01691864.2024.2408593"},{"key":"e_1_3_2_24_1","unstructured":"Brohan A Chebotar Y Finn C et\u00a0al. Do As I Can Not As I Say: grounding language in robotic affordances. In: Conference on Robot Learning; Atlanta; 2023. p.\u00a0287\u2013318."},{"key":"e_1_3_2_25_1","doi-asserted-by":"crossref","unstructured":"Shirasaka M Matsushima T Tsunashima S et\u00a0al. Self-recovery prompting: promptable general purpose service robot system with foundation models and self-recovery. In: 2024 IEEE International Conference on Robotics and Automation (ICRA); Yokohama; 2024.","DOI":"10.1109\/ICRA57147.2024.10611640"},{"key":"e_1_3_2_26_1","unstructured":"Driess D Xia F Sajjadi M et\u00a0al. PaLM-E: an embodied multimodal language model. In: Proceedings of the 40th International Conference on Machine Learning; Honolulu; 2023. p.\u00a08469\u20138488."},{"key":"e_1_3_2_27_1","doi-asserted-by":"crossref","unstructured":"Zha L Cui Y Lin LH et\u00a0al. Distilling and retrieving generalizable knowledge for robot manipulation via language corrections. In: 2024 IEEE International Conference on Robotics and Automation (ICRA); Yokohama; 2024. p.\u00a015172\u201315179.","DOI":"10.1109\/ICRA57147.2024.10610455"},{"key":"e_1_3_2_28_1","doi-asserted-by":"crossref","unstructured":"Sun L Jha D Hori C et\u00a0al. Interactive planning using large language models for partially observable robotics tasks. In: 2024 IEEE International Conference on Robotics and Automation (ICRA); Yokohama; 2024.","DOI":"10.1109\/ICRA57147.2024.10610981"},{"key":"e_1_3_2_29_1","doi-asserted-by":"crossref","unstructured":"Zhang J Huang Z Ray A et\u00a0al. Feedback-guided autonomous driving. In: 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR); Seattle; 2024. p.\u00a015000\u201315011.","DOI":"10.1109\/CVPR52733.2024.01421"},{"key":"e_1_3_2_30_1","unstructured":"Zhi P Zhang Z Han M et\u00a0al. Closed-loop open-vocabulary mobile manipulation with GPT-4V. Preprint; 2024. Available at arXiv:240410220."},{"key":"e_1_3_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3441493"},{"key":"e_1_3_2_32_1","unstructured":"Xiong C Shen C Li X et\u00a0al. Autonomous interactive correction MLLM for robust robotic manipulation. In: 8th Annual Conference on Robot Learning; Munich; 2024."},{"key":"e_1_3_2_33_1","doi-asserted-by":"crossref","unstructured":"Sermanet P Ding T Zhao J et\u00a0al. RoboVQA: multimodal long-horizon reasoning for robotics. In: IEEE International Conference on Robotics and Automation (ICRA); Yokohama; 2024. p.\u00a0645\u2013652.","DOI":"10.1109\/ICRA57147.2024.10610216"},{"key":"e_1_3_2_34_1","unstructured":"Duan J Pumacay W Kumar N et\u00a0al. AHA: a vision-language-model for detecting and reasoning over failures in robotic manipulation. In: 1st Workshop on X-Embodiment Robot Learning; Munich; 2024."},{"key":"e_1_3_2_35_1","doi-asserted-by":"crossref","unstructured":"Antol S Agrawal A Lu J et\u00a0al. VQA: visual question answering. In: 2015 IEEE International Conference on Computer Vision (ICCV); Santiago; 2015. p.\u00a02425\u20132433.","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_36_1","doi-asserted-by":"crossref","unstructured":"Yin S Fu C Zhao S et\u00a0al. A survey on multimodal large language models. Preprint; 2023. Available at arXiv:230613549.","DOI":"10.1093\/nsr\/nwae403"},{"key":"e_1_3_2_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2017.05.001"},{"key":"e_1_3_2_38_1","unstructured":"Wang P Bai S Tan S et\u00a0al. Qwen2-VL: enhancing vision-language model's perception of the world at any resolution. Preprint; 2024. Available at arXiv:240912191."},{"key":"e_1_3_2_39_1","unstructured":"Li J Li D Savarese S et\u00a0al. BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of the 40th International Conference on Machine Learning; Honolulu; 2023. p.\u00a019730\u201319742."},{"key":"e_1_3_2_40_1","unstructured":"Bai J Bai S Yang S et\u00a0al. Qwen-VL: a frontier large vision-language model with versatile abilities. Preprint; 2023. Available at arXiv:230812966."},{"key":"e_1_3_2_41_1","doi-asserted-by":"crossref","unstructured":"Wang W Bao H Dong L et\u00a0al. Image as a foreign language: beit pretraining for vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition; Vancouver; 2023. p.\u00a019175\u201319186.","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"e_1_3_2_42_1","doi-asserted-by":"crossref","unstructured":"Liu H Li C Li Y et\u00a0al. Improved baselines with visual instruction tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition; Seattle; 2024. p.\u00a026296\u201326306.","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_43_1","unstructured":"Liu H Li C Wu Q et\u00a0al. Visual instruction tuning. In: Advances in Neural Information Processing Systems (NeurIPS 2023); New Orleans; 2023. p.\u00a034892\u201334916."},{"key":"e_1_3_2_44_1","unstructured":"Dai W Li J Li D et\u00a0al. InstructBLIP: towards general-purpose vision-language models with instruction tuning. In: Proceedings of the 37th International Conference on Neural Information Processing Systems; New Orleans; 2023. p.\u00a049250\u201349267."},{"key":"e_1_3_2_45_1","doi-asserted-by":"crossref","unstructured":"Hudson DA Manning CD. A new dataset for real-world visual reasoning and compositional question answering. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR); Long Beach; 2019. p.\u00a06700\u20136709.","DOI":"10.1109\/CVPR.2019.00686"},{"key":"e_1_3_2_46_1","doi-asserted-by":"crossref","unstructured":"Goyal Y Khot T Summers-Stay D et\u00a0al. Making the V in VQA matter: elevating The role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition; Honolulu; 2017. p.\u00a06904\u20136913.","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_47_1","first-page":"572","article-title":"Combining recurrent, convolutional, and continuous-time models with linear state-space layers","volume":"34","author":"Gu A","year":"2021","unstructured":"Gu A, Johnson I, Goel K, et\u00a0al. Combining recurrent, convolutional, and continuous-time models with linear state-space layers. Adv Neural Inf Process Syst. 2021;34:572\u2013585.","journal-title":"Adv Neural Inf Process Syst"},{"key":"e_1_3_2_48_1","first-page":"35971","article-title":"On the parameterization and initialization of diagonal state space models","volume":"35","author":"Gu A","year":"2022","unstructured":"Gu A, Gupta A, Goel K, et\u00a0al. On the parameterization and initialization of diagonal state space models. Adv Neural Inf Process Syst. 2022;35:35971\u201335983.","journal-title":"Adv Neural Inf Process Syst"},{"key":"e_1_3_2_49_1","unstructured":"Smith JTH Warrington A Linderman SW. Simplified state space layers for sequence modeling. In: ICLR; Kigali; 2023."},{"key":"e_1_3_2_50_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0893-6080(05)80125-X"},{"key":"e_1_3_2_51_1","unstructured":"Tallec C Ollivier Y. Can recurrent neural networks warp time? In: ICLR; Vancouver; 2018."},{"key":"e_1_3_2_52_1","unstructured":"Gu A Dao T. Mamba: linear-time sequence modeling with selective state spaces. In: First Conference on Language Modeling; Philadelphia; 2024."},{"key":"e_1_3_2_53_1","unstructured":"Zhu L Liao B Zhang Q et\u00a0al. Vision mamba: efficient visual representation learning with bidirectional state space model. In: Proceedings of the 41st International Conference on Machine Learning; Vienna; 2024."},{"key":"e_1_3_2_54_1","doi-asserted-by":"crossref","unstructured":"Zhang Z Chong KT Comparison between first-order hold with zero-order hold in discretization of input-delay nonlinear systems. In: 2007 International Conference on Control Automation and Systems; Seoul; 2007. p.\u00a02892\u20132896.","DOI":"10.1109\/ICCAS.2007.4406863"},{"key":"e_1_3_2_55_1","unstructured":"Stella en 1.5B v5; 2024 [accessed 2024 Dec]. Available at https:\/\/huggingface.co\/dunzhang\/stella_en_1.5B_v5."},{"key":"e_1_3_2_56_1","first-page":"1","article-title":"DINOv2: learning robust visual features without supervision","author":"Oquab M","year":"2024","unstructured":"Oquab M, Darcet T, Moutakanni T, et\u00a0al. DINOv2: learning robust visual features without supervision. TMLR. 2024;1\u201331.","journal-title":"TMLR"},{"key":"e_1_3_2_57_1","unstructured":"OpenAI. GPT-4o; [accessed 2024 Nov]. Available at https:\/\/platform.openai.com\/docs\/models\/gpt-4o."},{"key":"e_1_3_2_58_1","unstructured":"Kuang Y Ye J Geng H et\u00a0al. RAM: retrieval-based affordance transfer for generalizable zero-shot robotic manipulation. In: 8th Annual Conference on Robot Learning; Munich; 2024."},{"key":"e_1_3_2_59_1","doi-asserted-by":"crossref","unstructured":"Bahl S Mendonca R Chen L et\u00a0al. Affordances from human videos as a versatile representation for robotics. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR); Vancouver; 2023. p.\u00a013778\u201313790.","DOI":"10.1109\/CVPR52729.2023.01324"},{"key":"e_1_3_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2025.3527290"},{"key":"e_1_3_2_61_1","doi-asserted-by":"crossref","unstructured":"Zhou X Girdhar R Joulin A et\u00a0al. Detecting twenty-thousand classes using image-level supervision. In: Computer Vision \u2013 ECCV 2022; Tel Aviv; 2022. p.\u00a0350\u2013368.","DOI":"10.1007\/978-3-031-20077-9_21"}],"container-title":["Advanced Robotics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.tandfonline.com\/doi\/pdf\/10.1080\/01691864.2025.2532610","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:43:09Z","timestamp":1755866589000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.tandfonline.com\/doi\/full\/10.1080\/01691864.2025.2532610"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,3]]},"references-count":60,"journal-issue":{"issue":"13","published-print":{"date-parts":[[2025,7,3]]}},"alternative-id":["10.1080\/01691864.2025.2532610"],"URL":"https:\/\/doi.org\/10.1080\/01691864.2025.2532610","relation":{},"ISSN":["0169-1864","1568-5535"],"issn-type":[{"type":"print","value":"0169-1864"},{"type":"electronic","value":"1568-5535"}],"subject":[],"published":{"date-parts":[[2025,7,3]]},"assertion":[{"value":"The publishing and review policy for this title is described in its Aims & Scope.","order":1,"name":"peerreview_statement","label":"Peer Review Statement"},{"value":"http:\/\/www.tandfonline.com\/action\/journalInformation?show=aimsScope&journalCode=tadr20","URL":"http:\/\/www.tandfonline.com\/action\/journalInformation?show=aimsScope&journalCode=tadr20","order":2,"name":"aims_and_scope_url","label":"Aim & Scope"},{"value":"2025-01-13","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-05-15","order":1,"name":"revised","label":"Revised","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-05-21","order":2,"name":"accepted","label":"Accepted","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-07-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}