{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,6]],"date-time":"2025-05-06T16:40:03Z","timestamp":1746549603846,"version":"3.40.5"},"reference-count":31,"publisher":"Informa UK Limited","issue":"7","license":[{"start":{"date-parts":[[2025,4,3]],"date-time":"2025-04-03T00:00:00Z","timestamp":1743638400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["www.tandfonline.com"],"crossmark-restriction":true},"short-container-title":["Advanced Robotics"],"published-print":{"date-parts":[[2025,4,3]]},"DOI":"10.1080\/01691864.2025.2487608","type":"journal-article","created":{"date-parts":[[2025,4,18]],"date-time":"2025-04-18T14:38:56Z","timestamp":1744987136000},"page":"323-337","update-policy":"https:\/\/doi.org\/10.1080\/tandf_crossmark_01","source":"Crossref","is-referenced-by-count":0,"title":["Situation classification of living environment by daily life support robot using pre-trained large-scale vision-language model"],"prefix":"10.1080","volume":"39","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1429-4401","authenticated-orcid":false,"given":"Yoshiki","family":"Obinata","sequence":"first","affiliation":[{"name":"The University of Tokyo","place":["Bunkyo-ku, Japan"]}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7464-7187","authenticated-orcid":false,"given":"Kento","family":"Kawaharazuka","sequence":"additional","affiliation":[{"name":"The University of Tokyo","place":["Bunkyo-ku, Japan"]}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3527-3087","authenticated-orcid":false,"given":"Naoaki","family":"Kanazawa","sequence":"additional","affiliation":[{"name":"The University of Tokyo","place":["Bunkyo-ku, Japan"]}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1790-1502","authenticated-orcid":false,"given":"Naoya","family":"Yamaguchi","sequence":"additional","affiliation":[{"name":"The University of Tokyo","place":["Bunkyo-ku, Japan"]}]},{"given":"Naoto","family":"Tsukamoto","sequence":"additional","affiliation":[{"name":"The University of Tokyo","place":["Bunkyo-ku, Japan"]}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4433-0013","authenticated-orcid":false,"given":"Iori","family":"Yanokura","sequence":"additional","affiliation":[{"name":"The University of Tokyo","place":["Bunkyo-ku, Japan"]}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5096-0702","authenticated-orcid":false,"given":"Shingo","family":"Kitagawa","sequence":"additional","affiliation":[{"name":"The University of Tokyo","place":["Bunkyo-ku, Japan"]}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6606-6692","authenticated-orcid":false,"given":"Kei","family":"Okada","sequence":"additional","affiliation":[{"name":"The University of Tokyo","place":["Bunkyo-ku, Japan"]}]},{"given":"Masayuki","family":"Inaba","sequence":"additional","affiliation":[{"name":"The University of Tokyo","place":["Bunkyo-ku, Japan"]}]}],"member":"301","published-online":{"date-parts":[[2025,4,18]]},"reference":[{"key":"e_1_3_2_2_1","doi-asserted-by":"publisher","DOI":"10.7210\/jrsj.26.330"},{"key":"e_1_3_2_3_1","doi-asserted-by":"crossref","unstructured":"Okada K Ogura T Haneda A et al. Humanoid motion generation system on HRP2-JSK for daily life environment. In: IEEE International Conference Mechatronics and Automation; Niagara Falls ON Canada; 2005. Vol. 4 p. 1772\u20131777.","DOI":"10.1109\/ICMA.2005.1626828"},{"key":"e_1_3_2_4_1","doi-asserted-by":"crossref","unstructured":"Yamazaki K Ueda R Nozawa S et al. System integration of a daily assistive robot and its application to tidying and cleaning rooms. In: IEEE\/RSJ International Conference on Intelligent Robots and Systems; Taipei Taiwan; 2010. p. 1365\u20131371.","DOI":"10.1109\/IROS.2010.5653614"},{"key":"e_1_3_2_5_1","doi-asserted-by":"publisher","DOI":"10.1518\/001872095779049543"},{"key":"e_1_3_2_6_1","unstructured":"Vaswani A Shazeer N Parmar N et al. Attention is all you need. In: Advances in Neural Information Processing Systems 30; Long Beach California USA; 2017."},{"key":"e_1_3_2_7_1","unstructured":"Wang P Yang A Men R et al. OFA: unifying architectures tasks and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning; Baltimore MD USA; 2022. p. 23318\u201323340."},{"key":"e_1_3_2_8_1","doi-asserted-by":"crossref","unstructured":"Li L Zhang P Zhang H et al. Grounded language-image pre-training. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition; New Orleans Louisiana USA; 2022. p. 10965\u201310975.","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_9_1","doi-asserted-by":"crossref","unstructured":"Antol S Agrawal A Lu J et al. VQA: visual question answering. In: IEEE International Conference on Computer Vision; Santiago Chile; 2015. p. 2425\u20132433.","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_10_1","doi-asserted-by":"crossref","unstructured":"Kawaharazuka K Obinata Y Kanazawa N et al. Robotic applications of pre-trained vision-language models to various recognition behaviors. In: IEEE-RAS 22nd International Conference on Humanoid Robots; Austin TX USA; 2023. p. 1\u20138.","DOI":"10.1109\/Humanoids57100.2023.10375211"},{"key":"e_1_3_2_11_1","doi-asserted-by":"crossref","unstructured":"Huang C Mees O Zeng A et al. Visual language maps for robot navigation. In: IEEE International Conference on Robotics and Automation; London UK; 2023. p. 10608\u201310615.","DOI":"10.1109\/ICRA48891.2023.10160969"},{"key":"e_1_3_2_12_1","unstructured":"Brohan A Brown N Carbajal J et\u00a0al. RT-2: vision-language-action models transfer web knowledge to robotic control 2023. arXiv:2307.15818."},{"key":"e_1_3_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2007.40"},{"key":"e_1_3_2_14_1","doi-asserted-by":"publisher","DOI":"10.7210\/jrsj.31.918"},{"key":"e_1_3_2_15_1","article-title":"Deep learning for scene classification: A survey","author":"Zeng D","year":"2021","unstructured":"Zeng D, Liao M, Tavakolian M, et al. Deep learning for scene classification: A survey. arXiv preprint arXiv:2101.1053. 2021.","journal-title":"arXiv preprint arXiv:2101.1053"},{"key":"e_1_3_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2017.2675998"},{"key":"e_1_3_2_17_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1007617005950"},{"key":"e_1_3_2_18_1","doi-asserted-by":"crossref","unstructured":"Bosch A Zisserman A Munoz X. Scene classification via pLSA. In: European Conference on Computer Vision; Graz (Austria): Springer; 2006. p. 517\u2013530.","DOI":"10.1007\/11744085_40"},{"key":"e_1_3_2_19_1","doi-asserted-by":"crossref","unstructured":"Fei-Fei L Perona P. A Bayesian hierarchical model for learning natural scene categories. In: IEEE Computer Society Conference on Computer Vision and Pattern Recognition; San Diego California USA; 2005. Vol. 2 p. 524\u2013531.","DOI":"10.1109\/CVPR.2005.16"},{"key":"e_1_3_2_20_1","doi-asserted-by":"publisher","DOI":"10.1080\/01431161.2012.705443"},{"key":"e_1_3_2_21_1","unstructured":"Chen B Sahdev R Wu D et al. Scene classification in indoor environments for robots using context based word embeddings. In: IEEE International Conference on Robotics and Automation Workshop; Montreal Canada; 2019."},{"key":"e_1_3_2_22_1","doi-asserted-by":"crossref","unstructured":"Obinata Y Kawaharazuka K Kanazawa N et al. Semantic scene difference detection in daily life patrolling by mobile robots using pre-trained large-scale vision-language model. In: IEEE\/RSJ International Conference on Intelligent Robots and Systems; Detroit Michigan USA; 2023. p. 3228\u20133233.","DOI":"10.1109\/IROS55552.2023.10342467"},{"key":"e_1_3_2_23_1","unstructured":"Song K Tan X Qin T et\u00a0al. MPNet: masked and permuted pre-training for language understanding. In: Advances in Neural Information Processing Systems 33; 2020. p.\u00a016857\u201316867."},{"key":"e_1_3_2_24_1","unstructured":"Sentence-transformers\/all-mpnet-base-v2. Available from: https:\/\/huggingface.co\/sentence-transformers\/all-mpnet-base-v2. [Online; accessed 17-September-2022]."},{"key":"e_1_3_2_25_1","doi-asserted-by":"publisher","DOI":"10.1002\/9780470316801"},{"key":"e_1_3_2_26_1","doi-asserted-by":"crossref","unstructured":"Zhou X Girdhar R Joulin A et al. Detecting twenty-thousand classes using image-level supervision. In: European Conference on Computer Vision; Tel Aviv (Israel): Springer; 2022. p. 350\u2013368.","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"e_1_3_2_27_1","unstructured":"Wise M Ferguson M King D et al. Fetch & freight: standard platforms for service robot applications. In: Workshop on Autonomous Mobile Service Robots; New York City USA; 2016. p. 1\u20136."},{"key":"e_1_3_2_28_1","doi-asserted-by":"crossref","unstructured":"Obinata Y Yanokura I Tsukamoto N et al. System for teaching robot action instructions and responding to situations using a chat application. In: International Conference on Intelligent Autonomous Systems; Suwon (Korea): Springer; 2023. p. 375\u2013384.","DOI":"10.1007\/978-3-031-44981-9_31"},{"key":"e_1_3_2_29_1","unstructured":"Brown T Mann B Ryder N et\u00a0al. Language models are few-shot learners. In: Advances in Neural Information Processing Systems 33; Curran Associates Inc.; 2020. p.\u00a01877\u20131901."},{"key":"e_1_3_2_30_1","doi-asserted-by":"crossref","unstructured":"Huang R Peng S Takmaz A et al. Segment3D: learning fine-grained class-agnostic 3D segmentation without manual labels. In: European Conference on Computer Vision; Milan Italy; 2024.","DOI":"10.1007\/978-3-031-72754-2_16"},{"key":"e_1_3_2_31_1","doi-asserted-by":"crossref","unstructured":"Peng S Genova K Jiang C et al. OpenScene: 3D scene understanding with open vocabularies. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition; Vancouver Canada; 2023. p. 815\u2013824.","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"e_1_3_2_32_1","unstructured":"Obinata Y Kanazawa N Kawaharazuka K et\u00a0al. Foundation model based open vocabulary task planning and executive system for general purpose service robots 2023 arXiv:2308.03357."}],"container-title":["Advanced Robotics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.tandfonline.com\/doi\/pdf\/10.1080\/01691864.2025.2487608","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,6]],"date-time":"2025-05-06T16:00:39Z","timestamp":1746547239000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.tandfonline.com\/doi\/full\/10.1080\/01691864.2025.2487608"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,3]]},"references-count":31,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,4,3]]}},"alternative-id":["10.1080\/01691864.2025.2487608"],"URL":"https:\/\/doi.org\/10.1080\/01691864.2025.2487608","relation":{},"ISSN":["0169-1864","1568-5535"],"issn-type":[{"type":"print","value":"0169-1864"},{"type":"electronic","value":"1568-5535"}],"subject":[],"published":{"date-parts":[[2025,4,3]]},"assertion":[{"value":"The publishing and review policy for this title is described in its Aims & Scope.","order":1,"name":"peerreview_statement","label":"Peer Review Statement"},{"value":"http:\/\/www.tandfonline.com\/action\/journalInformation?show=aimsScope&journalCode=tadr20","URL":"http:\/\/www.tandfonline.com\/action\/journalInformation?show=aimsScope&journalCode=tadr20","order":2,"name":"aims_and_scope_url","label":"Aim & Scope"},{"value":"2024-02-29","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-11-13","order":1,"name":"revised","label":"Revised","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-03-03","order":2,"name":"accepted","label":"Accepted","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-04-18","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}