{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,18]],"date-time":"2026-04-18T16:40:25Z","timestamp":1776530425608,"version":"3.51.2"},"reference-count":54,"publisher":"Informa UK Limited","issue":"5","funder":[{"DOI":"10.13039\/501100001691","name":"JSPS","doi-asserted-by":"publisher","award":["23H03478"],"award-info":[{"award-number":["23H03478"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]},{"name":"JST Moonshot"},{"DOI":"10.13039\/501100001863","name":"NEDO","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001863","id-type":"DOI","asserted-by":"publisher"}]},{"name":"JST BOOST","award":["JPMJBS2409"],"award-info":[{"award-number":["JPMJBS2409"]}]}],"content-domain":{"domain":["www.tandfonline.com"],"crossmark-restriction":true},"short-container-title":["Advanced Robotics"],"published-print":{"date-parts":[[2025,3,4]]},"DOI":"10.1080\/01691864.2025.2469689","type":"journal-article","created":{"date-parts":[[2025,3,4]],"date-time":"2025-03-04T17:37:41Z","timestamp":1741109861000},"page":"243-258","update-policy":"https:\/\/doi.org\/10.1080\/tandf_crossmark_01","source":"Crossref","is-referenced-by-count":2,"title":["DM\n            <sup>2<\/sup>\n            RM: dual-mode multimodal ranking for target objects and receptacles based on open-vocabulary instructions"],"prefix":"10.1080","volume":"39","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-0354-9070","authenticated-orcid":false,"given":"Ryosuke","family":"Korekata","sequence":"first","affiliation":[{"name":"Keio University","place":["Yokohama, Japan"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3879-645X","authenticated-orcid":false,"given":"Kanta","family":"Kaneda","sequence":"additional","affiliation":[{"name":"Keio University","place":["Yokohama, Japan"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9741-7628","authenticated-orcid":false,"given":"Shunya","family":"Nagashima","sequence":"additional","affiliation":[{"name":"Keio University","place":["Yokohama, Japan"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4077-3627","authenticated-orcid":false,"given":"Yuto","family":"Imai","sequence":"additional","affiliation":[{"name":"Keio University","place":["Yokohama, Japan"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0261-0510","authenticated-orcid":false,"given":"Komei","family":"Sugiura","sequence":"additional","affiliation":[{"name":"Keio University","place":["Yokohama, Japan"]}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"301","published-online":{"date-parts":[[2025,3,4]]},"reference":[{"key":"e_1_3_4_2_1","doi-asserted-by":"publisher","DOI":"10.1186\/s40648-019-0132-3"},{"key":"e_1_3_4_3_1","doi-asserted-by":"publisher","DOI":"10.1561\/1500000016"},{"key":"e_1_3_4_4_1","doi-asserted-by":"crossref","unstructured":"Vo N Jiang L Sun C et al. Composing text and image for image retrieval-an empirical odyssey. In: CVPR; 2019. p. 6439\u20136448.\u00a0Long Beach CA: IEEE\/CVF.","DOI":"10.1109\/CVPR.2019.00660"},{"key":"e_1_3_4_5_1","doi-asserted-by":"crossref","unstructured":"Wu H Gao Y Guo X et al. Fashion IQ: a new dataset towards retrieving images by natural language feedback. In: CVPR; 2021. p. 11307\u201311317.\u00a0Virtual: IEEE\/CVF.","DOI":"10.1109\/CVPR46437.2021.01115"},{"key":"e_1_3_4_6_1","doi-asserted-by":"crossref","unstructured":"Chen W Yao L Jin Q. Rethinking benchmarks for cross-modal image-text retrieval. In: SIGIR; 2023. p. 1241\u20131251. Taipei: ACM.","DOI":"10.1145\/3539618.3591758"},{"key":"e_1_3_4_7_1","unstructured":"Yenamandra S Ramachandran A Khanna M et al. The homerobot open vocab mobile manipulation challenge. In: NeurIPS; 2023.\u00a0New Orleans Louisiana."},{"key":"e_1_3_4_8_1","unstructured":"Melnik A B\u00fcttner M Harz L et al. UniTeam: open vocabulary mobile manipulation challenge; 2023. arXiv preprint arXiv:231208611."},{"key":"e_1_3_4_9_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.artint.2015.08.002"},{"key":"e_1_3_4_10_1","doi-asserted-by":"publisher","DOI":"10.1080\/01691864.2019.1663608"},{"key":"e_1_3_4_11_1","unstructured":"Yenamandra S Ramachandran A Yadav K et al. HomeRobot: open-vocabulary mobile manipulation. In: CoRL; 2023. p. 1975\u20132011.\u00a0Atlanta Georgia: PMLR."},{"key":"e_1_3_4_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3352363"},{"key":"e_1_3_4_13_1","unstructured":"Radford A Kim J Hallacy C et al. Learning transferable visual models from natural language supervision. In: ICML; 2021. p. 8748\u20138763.\u00a0Virtual: PMLR."},{"key":"e_1_3_4_14_1","doi-asserted-by":"crossref","unstructured":"Kirillov A Mintun E Ravi N et al. Segment anything. In: ICCV; 2023. p. 4015\u20134026.\u00a0Paris: IEEE\/CVF.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_4_15_1","unstructured":"Liu Y Chen W Bai Y et al. Aligning cyber space with physical world: a comprehensive survey on embodied AI; 2024. arXiv preprint arXiv:240706886."},{"key":"e_1_3_4_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2022.3141105"},{"key":"e_1_3_4_17_1","doi-asserted-by":"crossref","unstructured":"Anderson P Wu Q Teney D et al. Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments. In: CVPR; 2018. p. 3674\u20133683.\u00a0Salt Lake City Utah: IEEE\/CVF.","DOI":"10.1109\/CVPR.2018.00387"},{"key":"e_1_3_4_18_1","doi-asserted-by":"crossref","unstructured":"Qi Y Wu Q Anderson P et al. REVERIE: remote embodied visual referring expression in real indoor environments. In: CVPR; 2020. p. 9982\u20139991.\u00a0Virtual: IEEE\/CVF.","DOI":"10.1109\/CVPR42600.2020.01000"},{"key":"e_1_3_4_19_1","doi-asserted-by":"crossref","unstructured":"Zhu F Liang X Zhu Y et al. SOON: scenario oriented object navigation with graph-based exploration. In: CVPR; 2021. p. 12689\u201312699.\u00a0Virtual: IEEE\/CVF.","DOI":"10.1109\/CVPR46437.2021.01250"},{"key":"e_1_3_4_20_1","doi-asserted-by":"crossref","unstructured":"Chang A Dai A Funkhouser T et al. Matterport3D: learning from RGB-D data in indoor environments. In: 3DV; 2017. p. 667\u2013676.\u00a0Qingdao: IEEE.","DOI":"10.1109\/3DV.2017.00081"},{"key":"e_1_3_4_21_1","unstructured":"Ramakrishnan S Gokaslan A Wijmans E et al. Habitat-matterport 3D dataset (HM3D): 1000 large-scale 3D environments for embodied AI. In: NeurIPS; 2021.\u00a0Virtual."},{"key":"e_1_3_4_22_1","doi-asserted-by":"crossref","unstructured":"Yadav K Ramrakhya R Ramakrishnan S et al. Habitat-matterport 3D semantics dataset. In: CVPR; 2023. p. 4927\u20134936.\u00a0Vancouver British Columbia: IEEE\/CVF.","DOI":"10.1109\/CVPR52729.2023.00477"},{"key":"e_1_3_4_23_1","doi-asserted-by":"crossref","unstructured":"Sigurdsson G Thomason J Sukhatme G et al. RREx-BoT: remote referring expressions with a bag of tricks. In: IROS; 2023. p. 5203\u20135210. Detroit MI: IEEE\/RSJ.","DOI":"10.1109\/IROS55552.2023.10342093"},{"key":"e_1_3_4_24_1","doi-asserted-by":"crossref","unstructured":"Chen B Xia F Ichter B et al. Open-vocabulary queryable scene representations for real world planning. In: ICRA; 2023. p. 11509\u201311522.\u00a0London: IEEE.","DOI":"10.1109\/ICRA48891.2023.10161534"},{"key":"e_1_3_4_25_1","unstructured":"Hu Y Xie Q Jain V et al. Toward general-purpose robots via foundation models: a survey and meta-analysis; 2023. arXiv preprint arXiv:231208782."},{"key":"e_1_3_4_26_1","doi-asserted-by":"crossref","unstructured":"Firoozi R Tucker J Tian S et al. Foundation models in robotics: applications challenges and the future; 2023. arXiv preprint arXiv:240205741.","DOI":"10.1177\/02783649241281508"},{"key":"e_1_3_4_27_1","doi-asserted-by":"crossref","unstructured":"Kawaharazuka K Matsushima T Gambardella A et al. Real-world robot applications of foundation models: a review; 2024. arXiv preprint arXiv:231207843.","DOI":"10.1080\/01691864.2024.2408593"},{"key":"e_1_3_4_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10514-023-10139-z"},{"key":"e_1_3_4_29_1","unstructured":"Driess D Xia F Sajjadi M et al. PaLM-E: an embodied multimodal language model. In: ICML; 2023. p. 8469\u20138488.\u00a0Honolulu Hawaii: PMLR."},{"key":"e_1_3_4_30_1","unstructured":"Ichter B Brohan A Chebotar Y et al. Do as i can not as i say: grounding language in robotic affordances. In: CoRL; 2023. p. 287\u2013318.\u00a0Atlanta Georgia: PMLR."},{"key":"e_1_3_4_31_1","doi-asserted-by":"crossref","unstructured":"Song C Wu J Washington C et al. LLM-planner: few-shot grounded planning for embodied agents with large language models. In: ICCV; 2023. p. 2998\u20133009.\u00a0Paris: IEEE\/CVF.","DOI":"10.1109\/ICCV51070.2023.00280"},{"key":"e_1_3_4_32_1","doi-asserted-by":"crossref","unstructured":"Hazra R Martires P Raedts L. SayCanPay: heuristic planning with large language models using learnable domain knowledge. In: AAAI; 2024. p. 20123\u201320133.\u00a0Vancouver British Columbia.","DOI":"10.1609\/aaai.v38i18.29991"},{"key":"e_1_3_4_33_1","doi-asserted-by":"crossref","unstructured":"Singh I Blukis V Mousavian A et al. ProgPrompt: generating situated robot task plans using large language models. In: ICRA; 2023. p. 11523\u201311530. London: IEEE.","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"e_1_3_4_34_1","doi-asserted-by":"crossref","unstructured":"Liang J Huang W Xia F et al. Code as policies: language model programs for embodied control. In: ICRA; 2023. p. 9493\u20139500.\u00a0London: IEEE.","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"e_1_3_4_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2021.07.009"},{"key":"e_1_3_4_36_1","doi-asserted-by":"publisher","DOI":"10.33093\/ipbss"},{"key":"e_1_3_4_37_1","doi-asserted-by":"crossref","unstructured":"Yu L Poirson P Yang S et al. Modeling context in referring expressions. In: ECCV; 2016. p. 69\u201385.\u00a0Amsterdam: Springer.","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"e_1_3_4_38_1","doi-asserted-by":"crossref","unstructured":"Hatori J Kikuchi Y Kobayashi S et al. Interactively picking real-world objects with unconstrained spoken language instructions. In: ICRA; 2018. p. 3774\u20133781.\u00a0Brisbane: IEEE.","DOI":"10.1109\/ICRA.2018.8460699"},{"key":"e_1_3_4_39_1","doi-asserted-by":"crossref","unstructured":"Korekata R Kambara M Yoshida Y et al. Switching head\u2013tail funnel uniter for dual referring expression comprehension with fetch-and-carry tasks. In: IROS; 2023. p. 3865\u20133872.\u00a0Detroit MI: IEEE\/RSJ.","DOI":"10.1109\/IROS55552.2023.10342165"},{"key":"e_1_3_4_40_1","doi-asserted-by":"crossref","unstructured":"Iioka Y Yoshida Y Wada Y et al. Multimodal diffusion segmentation model for object segmentation from manipulation instructions. In: IROS; 2023. p. 7590\u20137597.\u00a0Detroit MI: IEEE\/RSJ.","DOI":"10.1109\/IROS55552.2023.10341402"},{"key":"e_1_3_4_41_1","doi-asserted-by":"crossref","unstructured":"Guadarrama S Rodner E Saenko K et al. Open-vocabulary object retrieval. In: RSS; 2014. p. 1\u20139.\u00a0Berkeley California.","DOI":"10.15607\/RSS.2014.X.041"},{"key":"e_1_3_4_42_1","doi-asserted-by":"crossref","unstructured":"Nguyen T Gopalan N Patel R et al. Robot object retrieval with contextual natural language queries. In: RSS; 2020.\u00a0Virtual.","DOI":"10.15607\/RSS.2020.XVI.080"},{"key":"e_1_3_4_43_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_4_44_1","doi-asserted-by":"crossref","unstructured":"Liu Z Rodriguez-Opazo C Teney D et al. Image retrieval on real-life images with pre-trained vision-and-language models. In: ICCV; 2021. p. 2125\u20132134.\u00a0Virtual: IEEE\/CVF.","DOI":"10.1109\/ICCV48922.2021.00213"},{"key":"e_1_3_4_45_1","doi-asserted-by":"crossref","unstructured":"Han X Wu Z Huang P et al. automatic spatially-aware fashion concept discovery. In: ICCV; 2017. p. 1463\u20131471.\u00a0Venice: IEEE\/CVF","DOI":"10.1109\/ICCV.2017.163"},{"key":"e_1_3_4_46_1","doi-asserted-by":"crossref","unstructured":"Chen Y Li L Yu L et al. UNITER: UNiversal image-TExt representation learning. In: ECCV; 2020. p. 104\u2013120.\u00a0Virtual: Springer.","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_4_47_1","unstructured":"Schuster S Manning C. Enhanced english universal dependencies: an improved representation for natural language understanding tasks. In: LREC; 2016. p. 2371\u20132378.\u00a0Portoro\u017e: ELRA."},{"key":"e_1_3_4_48_1","article-title":"Attention is all you need","author":"Vaswani A","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, et al. Attention is all you need. Adv Neural Inf Process Syst. 2017.\u00a0Long Beach, California.","journal-title":"Adv Neural Inf Process Syst"},{"key":"e_1_3_4_49_1","unstructured":"Oord A Li Y Vinyals O. Representation learning with contrastive predictive coding; 2018. arXiv preprint arXiv:180703748."},{"key":"e_1_3_4_50_1","doi-asserted-by":"crossref","unstructured":"Savva M Kadian A Maksymets O et al. Habitat: a platform for embodied AI research. In: ICCV; 2019. p. 9339\u20139347.\u00a0Seoul: IEEE\/CVF.","DOI":"10.1109\/ICCV.2019.00943"},{"key":"e_1_3_4_51_1","doi-asserted-by":"crossref","unstructured":"Zhou X Girdhar R Joulin A et al. Detecting twenty-thousand classes using image-level supervision. In: ECCV; 2022. p. 350\u2013368.\u00a0Tel Aviv: Springer.","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"e_1_3_4_52_1","doi-asserted-by":"crossref","unstructured":"Liu C Ding H Jiang X. GRES: generalized referring expression segmentation. In: CVPR; 2023. p. 23592\u201323601. Vancouver British Columbia: IEEE\/CVF.","DOI":"10.1109\/CVPR52729.2023.02259"},{"issue":"11","key":"e_1_3_4_53_1","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"Van der Maaten L","year":"2008","unstructured":"Van der Maaten L, Hinton G. Visualizing data using t-SNE. J\u00a0Mach Learn Res. 2008;9(11):2579\u20132605.","journal-title":"J\u00a0Mach Learn Res"},{"key":"e_1_3_4_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/MRA.2015.2448951"},{"key":"e_1_3_4_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2018.2849607"}],"container-title":["Advanced Robotics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.tandfonline.com\/doi\/pdf\/10.1080\/01691864.2025.2469689","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,15]],"date-time":"2025-03-15T09:35:59Z","timestamp":1742031359000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.tandfonline.com\/doi\/full\/10.1080\/01691864.2025.2469689"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,4]]},"references-count":54,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,3,4]]}},"alternative-id":["10.1080\/01691864.2025.2469689"],"URL":"https:\/\/doi.org\/10.1080\/01691864.2025.2469689","relation":{},"ISSN":["0169-1864","1568-5535"],"issn-type":[{"value":"0169-1864","type":"print"},{"value":"1568-5535","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,3,4]]},"assertion":[{"value":"The publishing and review policy for this title is described in its Aims & Scope.","order":1,"name":"peerreview_statement","label":"Peer Review Statement"},{"value":"http:\/\/www.tandfonline.com\/action\/journalInformation?show=aimsScope&journalCode=tadr20","URL":"http:\/\/www.tandfonline.com\/action\/journalInformation?show=aimsScope&journalCode=tadr20","order":2,"name":"aims_and_scope_url","label":"Aim & Scope"},{"value":"2024-08-27","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-01-07","order":1,"name":"revised","label":"Revised","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-01-31","order":2,"name":"accepted","label":"Accepted","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-03-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}