{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T20:22:15Z","timestamp":1771705335734,"version":"3.50.1"},"reference-count":18,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:00:00Z","timestamp":1721001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:00:00Z","timestamp":1721001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,7,15]]},"DOI":"10.1109\/icmew63481.2024.10645462","type":"proceedings-article","created":{"date-parts":[[2024,8,29]],"date-time":"2024-08-29T17:43:36Z","timestamp":1724953416000},"page":"1-5","source":"Crossref","is-referenced-by-count":23,"title":["3DMIT: 3D Multi-Modal Instruction Tuning for Scene Understanding"],"prefix":"10.1109","author":[{"given":"Zeju","family":"Li","sequence":"first","affiliation":[{"name":"Beijing Digital Native Digital City Research Center"}]},{"given":"Chao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing Digital Native Digital City Research Center"}]},{"given":"Xiaoyan","family":"Wang","sequence":"additional","affiliation":[{"name":"Beijing Digital Native Digital City Research Center"}]},{"given":"Ruilong","family":"Ren","sequence":"additional","affiliation":[{"name":"Beijing Digital Native Digital City Research Center"}]},{"given":"Yifan","family":"Xu","sequence":"additional","affiliation":[{"name":"Beijing Digital Native Digital City Research Center"}]},{"given":"Ruifei","family":"Ma","sequence":"additional","affiliation":[{"name":"Beijing Digital Native Digital City Research Center"}]},{"given":"Xiangde","family":"Liu","sequence":"additional","affiliation":[{"name":"Beijing Digital Native Digital City Research Center"}]},{"given":"Rong","family":"Wei","sequence":"additional","affiliation":[{"name":"Beijing Digital Native Digital City Research Center"}]}],"member":"263","reference":[{"key":"ref1","author":"Touvron","journal-title":"Llama: Open and efficient foundation language models"},{"key":"ref2","article-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","volume-title":"See https:\/\/vicuna. lmsys. org (accessed 14 April 2023)","author":"Chiang"},{"key":"ref3","author":"Hong","journal-title":"3d-llm: Injecting the 3D world into large language models"},{"key":"ref4","author":"Yin","year":"2023","journal-title":"Lamm: Language-assisted multi-modal instruction-tuning dataset, framework, and benchmark"},{"key":"ref5","author":"Wang","year":"2023","journal-title":"Chat-3d: Data-efficiently tuning large language model for universal dialogue of 3D scenes"},{"key":"ref6","author":"Liu","journal-title":"Improved baselines with visual instruction tuning"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_13"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.28013"},{"key":"ref10","author":"Xue","journal-title":"Ulip-2: Towards scalable multimodal pre-training for 3D understanding"},{"key":"ref11","author":"Zhou","journal-title":"Uni3d: Exploring unified 3D representation at scale"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01854"},{"key":"ref13","author":"Han","journal-title":"Imagebind-llm: Multi-modality instruction tuning"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-32248-9_23"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"ref16","author":"Liu","year":"2023","journal-title":"Visual instruction tuning"},{"key":"ref17","author":"Hu","journal-title":"Lora: Low-rank adaptation of large language models"},{"key":"ref18","author":"Yang","year":"2023","journal-title":"Llm-grounder: Open-vocabulary 3D visual grounding with large language model as an agent"}],"event":{"name":"2024 IEEE International Conference on Multimedia and Expo Workshops (ICMEW)","location":"Niagara Falls, ON, Canada","start":{"date-parts":[[2024,7,15]]},"end":{"date-parts":[[2024,7,19]]}},"container-title":["2024 IEEE International Conference on Multimedia and Expo Workshops (ICMEW)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10645349\/10645352\/10645462.pdf?arnumber=10645462","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,31]],"date-time":"2024-08-31T04:43:55Z","timestamp":1725079435000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10645462\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,15]]},"references-count":18,"URL":"https:\/\/doi.org\/10.1109\/icmew63481.2024.10645462","relation":{},"subject":[],"published":{"date-parts":[[2024,7,15]]}}}