{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T18:38:13Z","timestamp":1773772693189,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Beijing Municipal Science & Technology Commission, Administrative Commission of Zhongguancun Science Park","award":["Z231100005923035"],"award-info":[{"award-number":["Z231100005923035"]}]},{"name":"the Natural Science Foundation of China","award":["62276242"],"award-info":[{"award-number":["62276242"]}]},{"name":"National Aviation Science Foundation","award":["2022Z071078001"],"award-info":[{"award-number":["2022Z071078001"]}]},{"name":"Dreams Foundation of Jianghuai Advance Technology Center","award":["2023-ZM01Z001"],"award-info":[{"award-number":["2023-ZM01Z001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3688990","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"11407-11413","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["RAG-Guided Large Language Models for Visual Spatial Description with Adaptive Hallucination Corrector"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3197-8103","authenticated-orcid":false,"given":"Jun","family":"Yu","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1980-7859","authenticated-orcid":false,"given":"Yunxiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1976-0889","authenticated-orcid":false,"given":"Zerui","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5169-3669","authenticated-orcid":false,"given":"Zhao","family":"Yang","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1019-1477","authenticated-orcid":false,"given":"Gongpeng","family":"Zhao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1164-0551","authenticated-orcid":false,"given":"Fengzhao","family":"Sun","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1078-430X","authenticated-orcid":false,"given":"Fanrui","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1208-3031","authenticated-orcid":false,"given":"Qingsong","family":"Liu","sequence":"additional","affiliation":[{"name":"Unisound AI Technology Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3598-8564","authenticated-orcid":false,"given":"Jianqing","family":"Sun","sequence":"additional","affiliation":[{"name":"Unisound AI Technology Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8309-1301","authenticated-orcid":false,"given":"Jiaen","family":"Liang","sequence":"additional","affiliation":[{"name":"Unisound AI Technology Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0147-5335","authenticated-orcid":false,"given":"Yaohui","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00522"},{"key":"e_1_3_2_1_2_1","volume-title":"Bidirectional Encoder Representations from Transformers (BERT): A sentiment analysis odyssey. arXiv preprint arXiv:2007.01127","author":"Alaparthi Shivaji","year":"2020","unstructured":"Shivaji Alaparthi and Manit Mishra. 2020. Bidirectional Encoder Representations from Transformers (BERT): A sentiment analysis odyssey. arXiv preprint arXiv:2007.01127 (2020)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298856"},{"key":"e_1_3_2_1_6_1","volume-title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. arXiv preprint arXiv:2312.14238","author":"Chen Zhe","year":"2023","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, Bin Li, Ping Luo, Tong Lu, Yu Qiao, and Jifeng Dai. 2023. InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. arXiv preprint arXiv:2312.14238 (2023)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00850"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00957"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICITEED.2014.7007894"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2741510"},{"key":"e_1_3_2_1_12_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_13_1","volume-title":"Hou Pong Chan, and Heng Ji","author":"Huang Kung-Hsiang","year":"2023","unstructured":"Kung-Hsiang Huang, Hou Pong Chan, and Heng Ji. 2023. Zero-shot faithful factual error correction. arXiv preprint arXiv:2305.07982 (2023)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00643"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2050104.2050105"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata Joshua Kravitz Stephanie Chen Yannis Kalantidis Li-Jia Li David A Shamma et al. 2017. Visual genome: Connecting language and vision using crowdsourced dense image annotations. International journal of computer vision 123 (2017) 32--73.","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01316"},{"key":"e_1_3_2_1_20_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems 33 (2020), 9459--9474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00566"},{"key":"e_1_3_2_1_22_1","unstructured":"Haotian Liu Chunyuan Li QingyangWu and Yong Jae Lee. 2023. Visual Instruction Tuning."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501825"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/S15-2149"},{"key":"e_1_3_2_1_25_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research 21, 140 (2020), 1--67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1002\/(SICI)1097-4571(198609)37:5<279::AID-ASI1>3.0.CO;2-Q"},{"key":"e_1_3_2_1_27_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)."},{"key":"e_1_3_2_1_28_1","volume-title":"Quantifying Language Models? Sensitivity to Spurious Features in Prompt Design or: How I learned to start worrying about prompt formatting. arXiv preprint arXiv:2310.11324","author":"Sclar Melanie","year":"2023","unstructured":"Melanie Sclar, Yejin Choi, Yulia Tsvetkov, and Alane Suhr. 2023. Quantifying Language Models? Sensitivity to Spurious Features in Prompt Design or: How I learned to start worrying about prompt formatting. arXiv preprint arXiv:2310.11324 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Machine Learning. PMLR, 1611--1619","author":"Song Hyun Oh","year":"2014","unstructured":"Hyun Oh Song, Ross Girshick, Stefanie Jegelka, Julien Mairal, Zaid Harchaoui, and Trevor Darrell. 2014. On learning to localize objects with minimal supervision. In International Conference on Machine Learning. PMLR, 1611--1619."},{"key":"e_1_3_2_1_30_1","volume-title":"Interleaving retrieval with chain-of-thought reasoning for knowledgeintensive multi-step questions. arXiv preprint arXiv:2212.10509","author":"Trivedi Harsh","year":"2022","unstructured":"Harsh Trivedi, Niranjan Balasubramanian, Tushar Khot, and Ashish Sabharwal. 2022. Interleaving retrieval with chain-of-thought reasoning for knowledgeintensive multi-step questions. arXiv preprint arXiv:2212.10509 (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"Fvqa: Fact-based visual question answering","author":"Wang Peng","year":"2017","unstructured":"Peng Wang, Qi Wu, Chunhua Shen, Anthony Dick, and Anton Van Den Hengel. 2017. Fvqa: Fact-based visual question answering. IEEE transactions on pattern analysis and machine intelligence 40, 10 (2017), 2413--2427."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.29"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00214"},{"key":"e_1_3_2_1_34_1","volume-title":"Woodpecker: Hallucination correction for multimodal large language models. arXiv preprint arXiv:2310.16045","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Tong Xu, HaoWang, Dianbo Sui, Yunhang Shen, Ke Li, Xing Sun, and Enhong Chen. 2023. Woodpecker: Hallucination correction for multimodal large language models. arXiv preprint arXiv:2310.16045 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Jun Wang, Yiqun Hu, WilliamWang, ZhiguoWang, and Bing Xiang.","author":"Yu Donghan","year":"2022","unstructured":"Donghan Yu, Sheng Zhang, Patrick Ng, Henghui Zhu, Alexander Hanbo Li, Jun Wang, Yiqun Hu, WilliamWang, ZhiguoWang, and Bing Xiang. 2022. Decaf: Joint decoding of answers and logical forms for question answering over knowledge bases. arXiv preprint arXiv:2210.00063 (2022)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13042-010-0001-0"},{"key":"e_1_3_2_1_37_1","volume-title":"Generating visual spatial description via holistic 3D scene understanding. arXiv preprint arXiv:2305.11768","author":"Zhao Yu","year":"2023","unstructured":"Yu Zhao, Hao Fei, Wei Ji, Jianguo Wei, Meishan Zhang, Min Zhang, and Tat- Seng Chua. 2023. Generating visual spatial description via holistic 3D scene understanding. arXiv preprint arXiv:2305.11768 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Enhancing the Spatial Awareness Capability of Multi-Modal Large Language Model. arXiv preprint arXiv:2310.20357","author":"Zhao Yongqiang","year":"2023","unstructured":"Yongqiang Zhao, Zhenyu Li, Zhi Jin, Feng Zhang, Haiyan Zhao, Chengfeng Dou, Zhengwei Tao, Xinhai Xu, and Donghong Liu. 2023. Enhancing the Spatial Awareness Capability of Multi-Modal Large Language Model. arXiv preprint arXiv:2310.20357 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Visual spatial description: Controlled spatial-oriented image-to-text generation. arXiv preprint arXiv:2210.11109","author":"Zhao Yu","year":"2022","unstructured":"Yu Zhao, Jianguo Wei, Zhichao Lin, Yueheng Sun, Meishan Zhang, and Min Zhang. 2022. Visual spatial description: Controlled spatial-oriented image-to-text generation. arXiv preprint arXiv:2210.11109 (2022)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3688990","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3688990","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:29Z","timestamp":1750295849000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3688990"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":39,"alternative-id":["10.1145\/3664647.3688990","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3688990","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}