{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T07:18:00Z","timestamp":1772090280622,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Hong Kong RIF","award":["Grant No. R6021-20"],"award-info":[{"award-number":["Grant No. R6021-20"]}]},{"name":"the Postdoctoral Fellowship Program of CPSF","award":["Grant Number GZC20232292"],"award-info":[{"award-number":["Grant Number GZC20232292"]}]},{"name":"Hong Kong CRF","award":["Grant No. C2004-21G and C7004-22G"],"award-info":[{"award-number":["Grant No. C2004-21G and C7004-22G"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681022","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"2156-2165","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["3D Question Answering for City Scene Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2290-0944","authenticated-orcid":false,"given":"Penglei","family":"Sun","sequence":"first","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8146-2236","authenticated-orcid":false,"given":"Yaoxian","family":"Song","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2639-1804","authenticated-orcid":false,"given":"Xiang","family":"Liu","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2458-6774","authenticated-orcid":false,"given":"Xiaofei","family":"Yang","sequence":"additional","affiliation":[{"name":"Guangzhou University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2986-967X","authenticated-orcid":false,"given":"Qiang","family":"Wang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0265-3454","authenticated-orcid":false,"given":"Tiefeng","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0608-9408","authenticated-orcid":false,"given":"Yang","family":"Yang","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9745-4372","authenticated-orcid":false,"given":"Xiaowen","family":"Chu","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01854"},{"key":"e_1_3_2_1_4_1","volume-title":"Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/J.ENG.2016.01.003"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00321"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 1--10","author":"Das Abhishek","year":"2018","unstructured":"Abhishek Das, Samyak Datta, Georgia Gkioxari, Stefan Lee, Devi Parikh, and Dhruv Batra. 2018. Embodied question answering. In Proceedings of the IEEE conference on computer vision and pattern recognition. 1--10."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01853"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CRV55824.2022.00038"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3210780"},{"key":"e_1_3_2_1_12_1","volume-title":"3DGraphSeg: A unified graph representation-based point cloud segmentation framework for full-range highspeed railway environments","author":"Geng Yixuan","year":"2023","unstructured":"Yixuan Geng, Zhipeng Wang, Limin Jia, Yong Qin, Yuanyuan Chai, Keyan Liu, and Lei Tong. 2023. 3DGraphSeg: A unified graph representation-based point cloud segmentation framework for full-range highspeed railway environments. IEEE Transactions on Industrial Informatics (2023)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41559-020-01358-z"},{"key":"e_1_3_2_1_14_1","volume-title":"Science","volume":"352","author":"Henderson J Vernon","year":"2016","unstructured":"J Vernon Henderson, Anthony J Venables, Tanner Regan, and Ilia Samsonov. 2016. Building functional cities. Science, Vol. 352, 6288 (2016), 946--947."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01554-9"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01057"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_1_18_1","volume-title":"3-D scene graph: A sparse and semantic representation of physical environments for intelligent agents","author":"Kim Ue-Hwan","year":"2019","unstructured":"Ue-Hwan Kim, Jin-Man Park, Taek-Jin Song, and Jong-Hwan Kim. 2019. 3-D scene graph: A sparse and semantic representation of physical environments for intelligent agents. IEEE transactions on cybernetics, Vol. 50, 12 (2019), 4921--4933."},{"key":"e_1_3_2_1_19_1","volume-title":"Semi-Supervised Classification with Graph Convolutional Networks. In International Conference on Learning Representations.","author":"Kipf Thomas N","year":"2016","unstructured":"Thomas N Kipf and Max Welling. 2016. Semi-Supervised Classification with Graph Convolutional Networks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9196558"},{"key":"e_1_3_2_1_21_1","first-page":"1","article-title":"Towards augmented reality driven human-city interaction: Current research on mobile headsets and future challenges","volume":"54","author":"Lee Lik-Hang","year":"2021","unstructured":"Lik-Hang Lee, Tristan Braud, Simo Hosio, and Pan Hui. 2021. Towards augmented reality driven human-city interaction: Current research on mobile headsets and future challenges. ACM Computing Surveys (CSUR), Vol. 54, 8 (2021), 1--38.","journal-title":"ACM Computing Surveys (CSUR)"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6349"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3179507"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_6"},{"key":"e_1_3_2_1_25_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Ma Xiaojian","year":"2022","unstructured":"Xiaojian Ma, Silong Yong, Zilong Zheng, Qing Li, Yitao Liang, Song-Chun Zhu, and Siyuan Huang. 2022. SQA3D: Situated Question Answering in 3D Scenes. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_27_1","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track.","author":"Miyanishi Taiki","year":"2023","unstructured":"Taiki Miyanishi, Fumiya Kitamori, Shuhei Kurita, Jungdae Lee, Motoaki Kawanabe, and Nakamasa Inoue. 2023. CityRefer: Geography-aware 3D Visual Grounding Dataset on City-scale Point Cloud Data. In Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00593"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00937"},{"key":"e_1_3_2_1_30_1","volume-title":"Pointnet: Deep hierarchical feature learning on point sets in a metric space. Advances in neural information processing systems","author":"Qi Charles Ruizhongtai","year":"2017","unstructured":"Charles Ruizhongtai Qi, Li Yi, Hao Su, and Leonidas J Guibas. 2017. Pointnet: Deep hierarchical feature learning on point sets in a metric space. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28253"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jml.2013.09.002"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11164"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19824-3_40"},{"key":"e_1_3_2_1_36_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/2629489"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/VR46266.2020.00026"},{"key":"e_1_3_2_1_39_1","volume-title":"LLM-powered Data Augmentation for Enhanced Cross-lingual Performance. In The 2023 Conference on Empirical Methods in Natural Language Processing.","author":"Whitehouse Chenxi","year":"2023","unstructured":"Chenxi Whitehouse, Monojit Choudhury, and Alham Fikri Aji. 2023. LLM-powered Data Augmentation for Enhanced Cross-lingual Performance. In The 2023 Conference on Empirical Methods in Natural Language Processing."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00682"},{"key":"e_1_3_2_1_41_1","volume-title":"Building generalizable agents with a realistic and rich 3d environment. arXiv preprint arXiv:1801.02209","author":"Wu Yi","year":"2018","unstructured":"Yi Wu, Yuxin Wu, Georgia Gkioxari, and Yuandong Tian. 2018. Building generalizable agents with a realistic and rich 3d environment. arXiv preprint arXiv:1801.02209 (2018)."},{"key":"e_1_3_2_1_42_1","first-page":"1","article-title":"A survey of scene graph: Generation and application","volume":"1","author":"Xu Pengfei","year":"2020","unstructured":"Pengfei Xu, Xiaojun Chang, Ling Guo, Po-Yao Huang, Xiaojiang Chen, and Alexander G Hauptmann. 2020. A survey of scene graph: Generation and application. IEEE Trans. Neural Netw. Learn. Syst, Vol. 1 (2020), 1.","journal-title":"IEEE Trans. Neural Netw. Learn. Syst"},{"key":"e_1_3_2_1_43_1","first-page":"1","article-title":"Comprehensive Visual Question Answering on Point Clouds through Compositional Scene Manipulation","volume":"01","author":"Yan Xu","year":"2023","unstructured":"Xu Yan, Zhihao Yuan, Yuhao Du, Yinghong Liao, Yao Guo, Shuguang Cui, and Zhen Li. 2023. Comprehensive Visual Question Answering on Point Clouds through Compositional Scene Manipulation. IEEE Transactions on Visualization & Computer Graphics 01 (2023), 1--13.","journal-title":"IEEE Transactions on Visualization & Computer Graphics"},{"key":"e_1_3_2_1_44_1","volume-title":"UrbanBIS: A Large-Scale Benchmark for Fine-Grained Urban Building Instance Segmentation. In ACM SIGGRAPH 2023 Conference Proceedings. 1--11","author":"Yang Guoqing","year":"2023","unstructured":"Guoqing Yang, Fuyou Xue, Qi Zhang, Ke Xie, Chi-Wing Fu, and Hui Huang. 2023. UrbanBIS: A Large-Scale Benchmark for Fine-Grained Urban Building Instance Segmentation. In ACM SIGGRAPH 2023 Conference Proceedings. 1--11."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2022.3225327"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00647"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480483"},{"key":"e_1_3_2_1_48_1","volume-title":"Towards Explainable 3D Grounded Visual Question Answering: A New Benchmark and Strong Baseline","author":"Zhao Lichen","year":"2022","unstructured":"Lichen Zhao, Daigang Cai, Jing Zhang, Lu Sheng, Dong Xu, Rui Zheng, Yinjie Zhao, Lipeng Wang, and Xibo Fan. 2022. Towards Explainable 3D Grounded Visual Question Answering: A New Benchmark and Strong Baseline. IEEE Transactions on Circuits and Systems for Video Technology (2022)."},{"key":"e_1_3_2_1_49_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, et al. 2024. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_50_1","volume-title":"Chatgpt asks, blip-2 answers: Automatic questioning towards enriched visual descriptions. arXiv preprint arXiv:2303.06594","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Kilichbek Haydarov, Xiaoqian Shen, Wenxuan Zhang, and Mohamed Elhoseiny. 2023. Chatgpt asks, blip-2 answers: Automatic questioning towards enriched visual descriptions. arXiv preprint arXiv:2303.06594 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00272"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681022","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681022","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:37Z","timestamp":1750295857000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681022"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":50,"alternative-id":["10.1145\/3664647.3681022","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681022","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}