{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T16:02:00Z","timestamp":1781020920460,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["HUST: 2023JYCXJJ009"],"award-info":[{"award-number":["HUST: 2023JYCXJJ009"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangdong Basic and Applied Basic Research Foundation","award":["2024A1515010224, 2024A1515030017 and 2024A1515011153"],"award-info":[{"award-number":["2024A1515010224, 2024A1515030017 and 2024A1515011153"]}]},{"name":"China National Natural Science Foundation","award":["No. 62202182, Grant 62176101, No.62276109"],"award-info":[{"award-number":["No. 62202182, Grant 62176101, No.62276109"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681257","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"6617-6626","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":20,"title":["MiniGPT-3D: Efficiently Aligning 3D Point Clouds with Large Language Models using 2D Priors"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6772-9427","authenticated-orcid":false,"given":"Yuan","family":"Tang","sequence":"first","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1966-2585","authenticated-orcid":false,"given":"Xu","family":"Han","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6835-5607","authenticated-orcid":false,"given":"Xianzhi","family":"Li","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6392-9461","authenticated-orcid":false,"given":"Qiao","family":"Yu","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7296-2522","authenticated-orcid":false,"given":"Yixue","family":"Hao","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6496-6793","authenticated-orcid":false,"given":"Long","family":"Hu","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0960-4447","authenticated-orcid":false,"given":"Min","family":"Chen","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716--23736."},{"key":"e_1_3_2_1_2_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72."},{"key":"e_1_3_2_1_4_1","volume-title":"Honeybee: Locality-enhanced projector for multimodal llm. arXiv preprint arXiv:2312.06742","author":"Cha Junbum","year":"2023","unstructured":"Junbum Cha, Wooyoung Kang, Jonghwan Mun, and Byungseok Roh. 2023. Honeybee: Locality-enhanced projector for multimodal llm. arXiv preprint arXiv:2312.06742 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Sharegpt4v: Improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793","author":"Chen Lin","year":"2023","unstructured":"Lin Chen, Jisong Li, Xiaoyi Dong, Pan Zhang, Conghui He, Jiaqi Wang, Feng Zhao, and Dahua Lin. 2023. Sharegpt4v: Improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"LL3DA: Visual Interactive Instruction Tuning for Omni-3D Understanding, Reasoning, and Planning. arXiv preprint arXiv:2311.18651","author":"Chen Sijin","year":"2023","unstructured":"Sijin Chen, Xin Chen, Chi Zhang, Mingsheng Li, Gang Yu, Hao Fei, Hongyuan Zhu, Jiayuan Fan, and Tao Chen. 2023. LL3DA: Visual Interactive Instruction Tuning for Omni-3D Understanding, Reasoning, and Planning. arXiv preprint arXiv:2311.18651 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Octavius: Mitigating task interference in mllms via moe. arXiv preprint arXiv:2311.02684","author":"Chen Zeren","year":"2023","unstructured":"Zeren Chen, Ziqin Wang, Zhen Wang, Huayang Liu, Zhenfei Yin, Si Liu, Lu Sheng, Wanli Ouyang, Yu Qiao, and Jing Shao. 2023. Octavius: Mitigating task interference in mllms via moe. arXiv preprint arXiv:2311.02684 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"2023 d. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. arXiv preprint arXiv:2312.14238","author":"Chen Zhe","year":"2023","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Zhong Muyan, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al. 2023 d. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. arXiv preprint arXiv:2312.14238 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Mobilevlm: A fast, reproducible and strong vision language assistant for mobile devices. arXiv preprint arXiv:2312.16886","author":"Chu Xiangxiang","year":"2023","unstructured":"Xiangxiang Chu, Limeng Qiao, Xinyang Lin, Shuang Xu, Yang Yang, Yiming Hu, Fei Wei, Xinyu Zhang, Bo Zhang, Xiaolin Wei, et al. 2023. Mobilevlm: A fast, reproducible and strong vision language assistant for mobile devices. arXiv preprint arXiv:2312.16886 (2023)."},{"key":"e_1_3_2_1_10_1","unstructured":"Xiangxiang Chu Limeng Qiao Xinyu Zhang Shuang Xu Fei Wei Yang Yang Xiaofei Sun Yiming Hu Xinyang Lin Bo Zhang et al. 2024. MobileVLM V2: Faster and Stronger Baseline for Vision Language Model. arXiv preprint arXiv:2402.03766 (2024)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00101"},{"key":"e_1_3_2_1_12_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2024","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi. 2024. Instructblip: Towards general-purpose vision-language models with instruction tuning. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"e_1_3_2_1_14_1","volume-title":"Loramoe: Revolutionizing mixture of experts for maintaining world knowledge in language model alignment. arXiv preprint arXiv:2312.09979","author":"Dou Shihan","year":"2023","unstructured":"Shihan Dou, Enyu Zhou, Yan Liu, Songyang Gao, Jun Zhao, Wei Shen, Yuhao Zhou, Zhiheng Xi, Xiao Wang, Xiaoran Fan, et al. 2023. Loramoe: Revolutionizing mixture of experts for maintaining world knowledge in language model alignment. arXiv preprint arXiv:2312.09979 (2023)."},{"key":"e_1_3_2_1_15_1","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research, Vol. 23, 120 (2022), 1--39.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00102"},{"key":"e_1_3_2_1_17_1","volume-title":"Higher Layers Need More LoRA Experts. arXiv preprint arXiv:2402.08562","author":"Gao Chongyang","year":"2024","unstructured":"Chongyang Gao, Kezhen Chen, Jinmeng Rao, Baochen Sun, Ruibo Liu, Daiyi Peng, Yawen Zhang, Xiaoyuan Guo, Jie Yang, and VS Subrahmanian. 2024. Higher Layers Need More LoRA Experts. arXiv preprint arXiv:2402.08562 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"Simcse: Simple contrastive learning of sentence embeddings. arXiv preprint arXiv:2104.08821","author":"Gao Tianyu","year":"2021","unstructured":"Tianyu Gao, Xingcheng Yao, and Danqi Chen. 2021. Simcse: Simple contrastive learning of sentence embeddings. arXiv preprint arXiv:2104.08821 (2021)."},{"key":"e_1_3_2_1_19_1","unstructured":"Ziyu Guo Renrui Zhang Xiangyang Zhu Yiwen Tang Xianzheng Ma Jiaming Han Kexin Chen Peng Gao Xianzhi Li Hongsheng Li et al. 2023. Point-bind & point-llm: Aligning point cloud with multi-modality for 3d understanding generation and instruction following. arXiv preprint arXiv:2309.00615 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Onellm: One framework to align all modalities with language. arXiv preprint arXiv:2312.03700","author":"Han Jiaming","year":"2023","unstructured":"Jiaming Han, Kaixiong Gong, Yiyuan Zhang, Jiaqi Wang, Kaipeng Zhang, Dahua Lin, Yu Qiao, Peng Gao, and Xiangyu Yue. 2023. Onellm: One framework to align all modalities with language. arXiv preprint arXiv:2312.03700 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Hong Yining","year":"2024","unstructured":"Yining Hong, Haoyu Zhen, Peihao Chen, Shuhong Zheng, Yilun Du, Zhenfang Chen, and Chuang Gan. 2024. 3d-llm: Injecting the 3d world into large language models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_23_1","volume-title":"Adaptive mixtures of local experts. Neural computation","author":"Jacobs Robert A","year":"1991","unstructured":"Robert A Jacobs, Michael I Jordan, Steven J Nowlan, and Geoffrey E Hinton. 1991. Adaptive mixtures of local experts. Neural computation, Vol. 3, 1 (1991), 79--87."},{"key":"e_1_3_2_1_24_1","volume-title":"Elevating 3D Representation with Joint Multi-modal Cues. arXiv preprint arXiv:2310.09503","author":"Ji Jiayi","year":"2023","unstructured":"Jiayi Ji, Haowei Wang, Changli Wu, Yiwei Ma, Xiaoshuai Sun, and Rongrong Ji. 2023. JM3D & JM3D-LLM: Elevating 3D Representation with Joint Multi-modal Cues. arXiv preprint arXiv:2310.09503 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"Hierarchical mixtures of experts and the EM algorithm. Neural computation","author":"Jordan Michael I","year":"1994","unstructured":"Michael I Jordan and Robert A Jacobs. 1994. Hierarchical mixtures of experts and the EM algorithm. Neural computation, Vol. 6, 2 (1994), 181--214."},{"key":"e_1_3_2_1_26_1","volume-title":"Beyond distillation: Task-level mixture-of-experts for efficient inference. arXiv preprint arXiv:2110.03742","author":"Kudugunta Sneha","year":"2021","unstructured":"Sneha Kudugunta, Yanping Huang, Ankur Bapna, Maxim Krikun, Dmitry Lepikhin, Minh-Thang Luong, and Orhan Firat. 2021. Beyond distillation: Task-level mixture-of-experts for efficient inference. arXiv preprint arXiv:2110.03742 (2021)."},{"key":"e_1_3_2_1_27_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2020. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668 (2020)."},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730--19742."},{"key":"e_1_3_2_1_29_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81."},{"key":"e_1_3_2_1_30_1","volume-title":"Uni3D-LLM: Unifying Point Cloud Perception, Generation and Editing with Large Language Models. arXiv preprint arXiv:2402.03327","author":"Liu Dingning","year":"2024","unstructured":"Dingning Liu, Xiaoshui Huang, Yuenan Hou, Zhihui Wang, Zhenfei Yin, Yongshun Gong, Peng Gao, and Wanli Ouyang. 2024. Uni3D-LLM: Unifying Point Cloud Perception, Generation and Editing with Large Language Models. arXiv preprint arXiv:2402.03327 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Aligning large multi-modal model with robust instruction tuning. arXiv preprint arXiv:2306.14565","author":"Liu Fuxiao","year":"2023","unstructured":"Fuxiao Liu, Kevin Lin, Linjie Li, Jianfeng Wang, Yaser Yacoob, and Lijuan Wang. 2023. Aligning large multi-modal model with robust instruction tuning. arXiv preprint arXiv:2306.14565 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023. Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_34_1","unstructured":"Abdin Marah Aneja Jyoti Bubeck Sebastien C\u00e9sar Teodoro Mendes Caio Chen Weizhu Del Giorno Allie Eldan Ronen Gopi Sivakanth Gunasekar Suriya Javaheripi Mojan Kauffmann Piero Tat Lee Yin Li Yuanzhi Nguyen Anh de Rosa Gustavo Saarikivi Olli Salim Adil Shah Shital Santacroce Michael Singh Behl Harkirat Taumann Kalai Adam Wang Xin Ward Rachel Witte Philipp Zhang Cyril and Zhang Yi. 2023. Phi-2: The surprising power of small language models. https:\/\/www.microsoft.com\/en-us\/research\/blog\/phi-2-the-surprising-power-of-small-language-models\/ [Online; accessed 18-March-2024]."},{"key":"e_1_3_2_1_35_1","unstructured":"OpenAI. 2022. ChatGPT. https:\/\/openai.com\/blog\/chatgpt."},{"key":"e_1_3_2_1_36_1","volume-title":"View in Article","volume":"2","author":"R","year":"2023","unstructured":"R OpenAI. 2023. Gpt-4 technical report. arxiv 2303.08774. View in Article, Vol. 2, 5 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"X-InstructBLIP: A Framework for aligning X-Modal instruction-aware representations to LLMs and Emergent Cross-modal Reasoning. arXiv preprint arXiv:2311.18799","author":"Panagopoulou Artemis","year":"2023","unstructured":"Artemis Panagopoulou, Le Xue, Ning Yu, Junnan Li, Dongxu Li, Shafiq Joty, Ran Xu, Silvio Savarese, Caiming Xiong, and Juan Carlos Niebles. 2023. X-InstructBLIP: A Framework for aligning X-Modal instruction-aware representations to LLMs and Emergent Cross-modal Reasoning. arXiv preprint arXiv:2311.18799 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_39_1","volume-title":"From sparse to soft mixtures of experts. arXiv preprint arXiv:2308.00951","author":"Puigcerver Joan","year":"2023","unstructured":"Joan Puigcerver, Carlos Riquelme, Basil Mustafa, and Neil Houlsby. 2023. From sparse to soft mixtures of experts. arXiv preprint arXiv:2308.00951 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"ShapeLLM: Universal 3D Object Understanding for Embodied Interaction. arXiv preprint arXiv:2402.17766","author":"Qi Zekun","year":"2024","unstructured":"Zekun Qi, Runpei Dong, Shaochen Zhang, Haoran Geng, Chunrui Han, Zheng Ge, Li Yi, and Kaisheng Ma. 2024. ShapeLLM: Universal 3D Object Understanding for Embodied Interaction. arXiv preprint arXiv:2402.17766 (2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"GPT4Point: A Unified Framework for Point-Language Understanding and Generation. arXiv preprint arXiv:2312.02980","author":"Qi Zhangyang","year":"2023","unstructured":"Zhangyang Qi, Ye Fang, Zeyi Sun, Xiaoyang Wu, Tong Wu, Jiaqi Wang, Dahua Lin, and Hengshuang Zhao. 2023. GPT4Point: A Unified Framework for Point-Language Understanding and Generation. arXiv preprint arXiv:2312.02980 (2023)."},{"key":"e_1_3_2_1_42_1","volume-title":"Sentence-bert: Sentence embeddings using siamese bert-networks. arXiv preprint arXiv:1908.10084","author":"Reimers Nils","year":"2019","unstructured":"Nils Reimers and Iryna Gurevych. 2019. Sentence-bert: Sentence embeddings using siamese bert-networks. arXiv preprint arXiv:1908.10084 (2019)."},{"key":"e_1_3_2_1_43_1","first-page":"8583","article-title":"Scaling vision with sparse mixture of experts","volume":"34","author":"Riquelme Carlos","year":"2021","unstructured":"Carlos Riquelme, Joan Puigcerver, Basil Mustafa, Maxim Neumann, Rodolphe Jenatton, Andr\u00e9 Susano Pinto, Daniel Keysers, and Neil Houlsby. 2021. Scaling vision with sparse mixture of experts. Advances in Neural Information Processing Systems, Vol. 34 (2021), 8583--8595.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_44_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"e_1_3_2_1_46_1","volume-title":"Kabilan Elangovan, Laura Gutierrez, Ting Fang Tan, and Daniel Shu Wei Ting.","author":"Thirunavukarasu Arun James","year":"2023","unstructured":"Arun James Thirunavukarasu, Darren Shu Jeng Ting, Kabilan Elangovan, Laura Gutierrez, Ting Fang Tan, and Daniel Shu Wei Ting. 2023. Large language models in medicine. Nature medicine, Vol. 29, 8 (2023), 1930--1940."},{"key":"e_1_3_2_1_47_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_49_1","volume-title":"Large language models for multi-modal human-robot interaction. arXiv preprint arXiv:2401.15174","author":"Wang Chao","year":"2024","unstructured":"Chao Wang, Stephan Hasler, Daniel Tanneberg, Felix Ocker, Frank Joublin, Antonello Ceravola, Joerg Deigmoeller, and Michael Gienger. 2024. Large language models for multi-modal human-robot interaction. arXiv preprint arXiv:2401.15174 (2024)."},{"key":"e_1_3_2_1_50_1","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume":"35","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in Neural Information Processing Systems, Vol. 35 (2022), 24824--24837.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_51_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 1912--1920","author":"Wu Zhirong","year":"2015","unstructured":"Zhirong Wu, Shuran Song, Aditya Khosla, Fisher Yu, Linguang Zhang, Xiaoou Tang, and Jianxiong Xiao. 2015. 3d shapenets: A deep representation for volumetric shapes. In Proceedings of the IEEE conference on computer vision and pattern recognition. 1912--1920."},{"key":"e_1_3_2_1_52_1","volume-title":"Pointllm: Empowering large language models to understand point clouds. arXiv preprint arXiv:2308.16911","author":"Xu Runsen","year":"2023","unstructured":"Runsen Xu, Xiaolong Wang, Tai Wang, Yilun Chen, Jiangmiao Pang, and Dahua Lin. 2023. Pointllm: Empowering large language models to understand point clouds. arXiv preprint arXiv:2308.16911 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"Juan Carlos Niebles, and Silvio Savarese","author":"Xue Le","year":"2023","unstructured":"Le Xue, Ning Yu, Shu Zhang, Junnan Li, Roberto Mart\u00edn-Mart\u00edn, Jiajun Wu, Caiming Xiong, Ran Xu, Juan Carlos Niebles, and Silvio Savarese. 2023. Ulip-2: Towards scalable multimodal pre-training for 3d understanding. arXiv preprint arXiv:2305.08275 (2023)."},{"key":"e_1_3_2_1_54_1","unstructured":"Aiyuan Yang Bin Xiao Bingning Wang Borong Zhang Ce Bian Chao Yin Chenxu Lv Da Pan Dian Wang Dong Yan et al. 2023. Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305 (2023)."},{"key":"e_1_3_2_1_55_1","volume-title":"Lidar-llm: Exploring the potential of large language models for 3d lidar understanding. arXiv preprint arXiv:2312.14074","author":"Yang Senqiao","year":"2023","unstructured":"Senqiao Yang, Jiaming Liu, Ray Zhang, Mingjie Pan, Zoey Guo, Xiaoqi Li, Zehui Chen, Peng Gao, Yandong Guo, and Shanghang Zhang. 2023. Lidar-llm: Exploring the potential of large language models for 3d lidar understanding. arXiv preprint arXiv:2312.14074 (2023)."},{"key":"e_1_3_2_1_56_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01871"},{"key":"e_1_3_2_1_58_1","volume-title":"Tinygpt-v: Efficient multimodal large language model via small backbones. arXiv preprint arXiv:2312.16862","author":"Yuan Zhengqing","year":"2023","unstructured":"Zhengqing Yuan, Zhaoxu Li, and Lichao Sun. 2023. Tinygpt-v: Efficient multimodal large language model via small backbones. arXiv preprint arXiv:2312.16862 (2023)."},{"key":"e_1_3_2_1_59_1","volume-title":"Pushing mixture of experts to the limit: Extremely parameter efficient moe for instruction tuning. arXiv preprint arXiv:2309.05444","author":"Zadouri Ted","year":"2023","unstructured":"Ted Zadouri, Ahmet \u00dcst\u00fcn, Arash Ahmadian, Beyza Ermics, Acyr Locatelli, and Sara Hooker. 2023. Pushing mixture of experts to the limit: Extremely parameter efficient moe for instruction tuning. arXiv preprint arXiv:2309.05444 (2023)."},{"key":"e_1_3_2_1_60_1","volume-title":"Yuhang Cao, Chao Xu, Linke Ouyang, Zhiyuan Zhao, Shuangrui Ding, Songyang Zhang, Haodong Duan, Hang Yan, et al.","author":"Zhang Pan","year":"2023","unstructured":"Pan Zhang, Xiaoyi Dong Bin Wang, Yuhang Cao, Chao Xu, Linke Ouyang, Zhiyuan Zhao, Shuangrui Ding, Songyang Zhang, Haodong Duan, Hang Yan, et al. 2023. Internlm-xcomposer: A vision-language large model for advanced text-image comprehension and composition. arXiv preprint arXiv:2309.15112 (2023)."},{"key":"e_1_3_2_1_61_1","volume-title":"Tinyllama: An open-source small language model. arXiv preprint arXiv:2401.02385","author":"Zhang Peiyuan","year":"2024","unstructured":"Peiyuan Zhang, Guangtao Zeng, Tianduo Wang, and Wei Lu. 2024. Tinyllama: An open-source small language model. arXiv preprint arXiv:2401.02385 (2024)."},{"key":"e_1_3_2_1_62_1","unstructured":"Yanzhe Zhang Ruiyi Zhang Jiuxiang Gu Yufan Zhou Nedim Lipka Diyi Yang and Tong Sun. 2023. Enhanced visual instruction tuning for text-rich image understanding. (2023)."},{"key":"e_1_3_2_1_63_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681257","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681257","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:42Z","timestamp":1750295862000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681257"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":63,"alternative-id":["10.1145\/3664647.3681257","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681257","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}