{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T16:25:40Z","timestamp":1779294340044,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,9]],"date-time":"2024-10-09T00:00:00Z","timestamp":1728432000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Guangzhou-HKUST(GZ) Joint Funding Project","award":["2024A03J0617"],"award-info":[{"award-number":["2024A03J0617"]}]},{"name":"Guangzhou Science and Technology Program City-University Joint Funding Project","award":["2023A03J0001"],"award-info":[{"award-number":["2023A03J0001"]}]},{"name":"HKUST Practice Research with Project title RBM talent cultivation Exploration","award":["HKUST(GZ)-ROP2023030"],"award-info":[{"award-number":["HKUST(GZ)-ROP2023030"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,9]]},"DOI":"10.1145\/3641825.3687742","type":"proceedings-article","created":{"date-parts":[[2024,9,22]],"date-time":"2024-09-22T04:26:46Z","timestamp":1726979206000},"page":"1-14","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Toward Facilitating Search in VR With the Assistance of Vision Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-6735-3492","authenticated-orcid":false,"given":"Chao","family":"Liu","sequence":"first","affiliation":[{"name":"Computational Media and Arts Thrust, The Hong Kong University of Science and Technology (Guangzhou), China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6233-3884","authenticated-orcid":false,"given":"Chi San (Clarence)","family":"Cheung","sequence":"additional","affiliation":[{"name":"Academy of Interdisciplinary Studies, The Hong Kong University of Science and Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2447-8793","authenticated-orcid":false,"given":"Mingqing","family":"Xu","sequence":"additional","affiliation":[{"name":"Computational Media and Arts Thrust, The Hong Kong University of Science and Technology (Guangzhou), China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1232-004X","authenticated-orcid":false,"given":"Zhongyue","family":"Zhang","sequence":"additional","affiliation":[{"name":"Computational Media and Arts Thrust, The Hong Kong University of Science and Technology (Guangzhou), China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3281-7594","authenticated-orcid":false,"given":"Mingyang","family":"Su","sequence":"additional","affiliation":[{"name":"Tsinghua Shenzhen International Graduate School, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0356-4712","authenticated-orcid":false,"given":"Mingming","family":"Fan","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), China and The Hong Kong University of Science and Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,9]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613905.3650815"},{"key":"e_1_3_2_1_2_1","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding Localization Text Reading and Beyond. arxiv:2308.12966\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2308.12966"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","unstructured":"Yejin Bang Samuel Cahyawijaya Nayeon Lee Wenliang Dai Dan Su Bryan Wilie Holy Lovenia Ziwei Ji Tiehzheng Yu Willy Chung Quyet Do Xu Yan and Pascale Fung. 2023. A Multitask Multilingual Multimodal Evaluation of ChatGPT on Reasoning Hallucination and Interactivity. 675\u2013718. https:\/\/doi.org\/10.18653\/v1\/2023.ijcnlp-main.45","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/1240624.1240789"},{"key":"e_1_3_2_1_5_1","volume-title":"Shikra: Unleashing Multimodal LLM\u2019s Referential Dialogue Magic.","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing Multimodal LLM\u2019s Referential Dialogue Magic."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","unstructured":"Taizhou Chen Yi-Shiun Wu and Zhu Kening. 2018. Investigating different modalities of directional cues for multi-task visual-searching scenario in virtual reality. 1\u20135. https:\/\/doi.org\/10.1145\/3281505.3281516","DOI":"10.1145\/3281505.3281516"},{"key":"e_1_3_2_1_7_1","unstructured":"Cathy Edwards. 2024. Circle (or highlight or scribble) to Search. Blog Post. https:\/\/blog.google\/products\/search\/google-circle-to-search-android\/ Accessed: 2024-05-19."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2024.3372045"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2015.97"},{"key":"e_1_3_2_1_10_1","unstructured":"Google. 2024. The Circle of Life: Bringing Google Search to Android. https:\/\/blog.google\/products\/search\/google-circle-to-search-android\/. Accessed: 2024-08-13."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/VR51125.2022.00032"},{"key":"e_1_3_2_1_12_1","unstructured":"Yining Hong Haoyu Zhen Peihao Chen Shuhong Zheng Yilun Du Zhenfang Chen and Chuang Gan. 2023. 3D-LLM: Injecting the 3D World into Large Language Models. arxiv:2307.12981\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2307.12981"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.20380\/GI2021.32"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517696"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3571730"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2407.02409"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474661"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3025453.3025684"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2505515.2505663"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Christian Ledig Lucas Theis Ferenc Huszar Jose Caballero Andrew Cunningham Alejandro Acosta Andrew Aitken Alykhan Tejani Johannes Totz Zehan Wang and Wenzhe Shi. 2017. Photo-Realistic Single Image Super-Resolution Using a Generative Adversarial Network. arxiv:1609.04802\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1609.04802","DOI":"10.1109\/CVPR.2017.19"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642100"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2402.1942"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1002\/acp.1002"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/VR.2019.8797891"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2017.2657238"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2024.3352100"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.7315\/CADCAM.2016.020"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1115\/DETC2018-85867"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411763.3451766"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445606"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2013.2268052"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0246398"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0246398"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642459"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/1076034.1076045"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICICAT57735.2023.10263706"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3363384.3363402"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","unstructured":"Ashok Veilumuthu and Parthasarathy Ramachandran. 2007. Discovering Implicit Feedbacks from Search Engine Log Files. (2007) 231\u2013242. https:\/\/doi.org\/10.1007\/978-3-540-75488-6_22","DOI":"10.1007\/978-3-540-75488-6_22"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3642979.3643006"},{"key":"e_1_3_2_1_40_1","volume-title":"Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. Advances in Neural Information Processing Systems 36","author":"Wang Wenhai","year":"2024","unstructured":"Wenhai Wang, Zhe Chen, Xiaokang Chen, Jiannan Wu, Xizhou Zhu, Gang Zeng, Ping Luo, Tong Lu, Jie Zhou, Yu Qiao, 2024. Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3592057"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401303"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","unstructured":"Austin Ward Yiyin Gu Sandeep Avula and Praneeth Chakravarthy. 2021. Interacting with Information in Immersive Virtual Environments. 2600\u20132604. https:\/\/doi.org\/10.1145\/3404835.3462787","DOI":"10.1145\/3404835.3462787"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/VR46266.2020.00097"},{"key":"e_1_3_2_1_45_1","unstructured":"Penghao Wu and Saining Xie. 2023. V*: Guided Visual Search as a Core Mechanism in Multimodal LLMs. arxiv:2312.14135\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2312.14135"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3623326"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581500"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.icte.2023.12.006"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613905.3648648"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","unstructured":"Andrew Zhou and Grace Yang. 2018. Minority Report by Lemur: Supporting Search Engine with Virtual Reality. 1329\u20131332. https:\/\/doi.org\/10.1145\/3209978.3210179","DOI":"10.1145\/3209978.3210179"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3450615.3464546"}],"event":{"name":"VRST '24: 30th ACM Symposium on Virtual Reality Software and Technology","location":"Trier Germany","acronym":"VRST '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["30th ACM Symposium on Virtual Reality Software and Technology"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3641825.3687742","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:04:02Z","timestamp":1750291442000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3641825.3687742"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,9]]},"references-count":51,"alternative-id":["10.1145\/3641825.3687742","10.1145\/3641825"],"URL":"https:\/\/doi.org\/10.1145\/3641825.3687742","relation":{},"subject":[],"published":{"date-parts":[[2024,10,9]]},"assertion":[{"value":"2024-10-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}