{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T05:48:13Z","timestamp":1751348893763,"version":"3.28.0"},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100000646","name":"JSPS KAKENHI","doi-asserted-by":"publisher","award":["20H04269\/23H03478"],"award-info":[{"award-number":["20H04269\/23H03478"]}],"id":[{"id":"10.13039\/501100000646","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,10,1]]},"DOI":"10.1109\/iros55552.2023.10341402","type":"proceedings-article","created":{"date-parts":[[2023,12,13]],"date-time":"2023-12-13T19:17:55Z","timestamp":1702495075000},"page":"7590-7597","source":"Crossref","is-referenced-by-count":4,"title":["Multimodal Diffusion Segmentation Model for Object Segmentation from Manipulation Instructions"],"prefix":"10.1109","author":[{"given":"Yui","family":"Iioka","sequence":"first","affiliation":[{"name":"Keio University,Yokohama, Kanagawa,Japan,223-8522"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"Yoshida","sequence":"additional","affiliation":[{"name":"Keio University,Yokohama, Kanagawa,Japan,223-8522"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuiga","family":"Wada","sequence":"additional","affiliation":[{"name":"Keio University,Yokohama, Kanagawa,Japan,223-8522"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shumpei","family":"Hatanaka","sequence":"additional","affiliation":[{"name":"Keio University,Yokohama, Kanagawa,Japan,223-8522"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Komei","family":"Sugiura","sequence":"additional","affiliation":[{"name":"Keio University,Yokohama, Kanagawa,Japan,223-8522"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1186\/s40648-019-0132-3"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01000"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"ref4","article-title":"OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework","volume":"abs\/2202.03052","author":"Wang","year":"2022","journal-title":"CoRR"},{"key":"ref5","article-title":"Label-Efficient Semantic Segmentation with Diffusion Models","author":"Baranchuk","year":"2022","journal-title":"ICLR"},{"key":"ref6","first-page":"6840","article-title":"Denoising Diffusion Probabilistic Models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref7","first-page":"8780","article-title":"Diffusion Models Beat Gans on Image Synthesis","volume-title":"NeurIPS","volume":"34","author":"Dhariwal","year":"2021"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2022.3217852"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00431"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr42600.2020.01005"},{"key":"ref11","first-page":"19652","article-title":"Referring Transformer: A One-step Approach to Multi-task Visual Grounding","volume":"34","author":"Li","year":"2021","journal-title":"NeurIPS"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2021.07.009"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1017\/ATSIP.2020.10"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1369-5"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3042066"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.04.069"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.524"},{"key":"ref18","first-page":"1780","article-title":"MDETR-Modulated Detection for End-to-End Multi-Modal Under-standing","author":"Kamath","year":"2021","journal-title":"ICCV"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01451"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2019.2926223"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref24","article-title":"Very Deep Convolutional Networks for Large-Scale Image Recognition","author":"Simonyan","year":"2014","journal-title":"arXiv preprint"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2021.3108500"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR56361.2022.9956163"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8460699"},{"key":"ref32","first-page":"8748","article-title":"Learning Transferable Visual Models From Natural Language Supervision","author":"Radford","year":"2021","journal-title":"ICML"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref34","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019","journal-title":"NAACL-HLT"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414006"},{"key":"ref36","article-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","author":"Dosovitskiy","year":"2020","journal-title":"ICLR"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475222"},{"key":"ref38","first-page":"667","article-title":"Matterport3D: Learning from RGB-D data in indoor environments","author":"Chang","year":"2018","journal-title":"3DV"}],"event":{"name":"2023 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)","start":{"date-parts":[[2023,10,1]]},"location":"Detroit, MI, USA","end":{"date-parts":[[2023,10,5]]}},"container-title":["2023 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10341341\/10341342\/10341402.pdf?arnumber=10341402","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,20]],"date-time":"2023-12-20T00:13:59Z","timestamp":1703031239000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10341402\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,1]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/iros55552.2023.10341402","relation":{},"subject":[],"published":{"date-parts":[[2023,10,1]]}}}