{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T04:04:05Z","timestamp":1780459445012,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T00:00:00Z","timestamp":1776038400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"National Natural Science Foundation of China","award":["Grant No.6237 2408"],"award-info":[{"award-number":["Grant No.6237 2408"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3800424.3800448","type":"proceedings-article","created":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T03:27:31Z","timestamp":1780457251000},"page":"90-101","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Bridging the Semantic Void: A Dual-branch RAG Framework to Enhance GUI Component Description for BVI Users"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-0189-4011","authenticated-orcid":false,"given":"Yuxuan","family":"Wu","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3645-1041","authenticated-orcid":false,"given":"Sheng","family":"Zhou","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4479-3738","authenticated-orcid":false,"given":"Ziwei","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1991-0429","authenticated-orcid":false,"given":"Liangcheng","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1097-2044","authenticated-orcid":false,"given":"Jiajun","family":"Bu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,2]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"Muna Al-Razgan Sarah Almoaiqel Nuha Alrajhi Alyah Alhumegani Abeer Alshehri Bashayr Alnefaie Raghad AlKhamiss and Shahad Rushdi. 2021. A systematic literature review on the usability of mobile applications for visually impaired users. PeerJ Computer Science 7 (2021) e771.","DOI":"10.7717\/peerj-cs.771"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"crossref","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et\u00a0al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems 35 (2022) 23716\u201323736.","DOI":"10.52202\/068431-1723"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/2351676.2351717"},{"key":"e_1_3_3_2_5_2","volume-title":"VoiceOver User Guide for iPhone","author":"Inc. Apple","year":"2023","unstructured":"Apple Inc.2023. VoiceOver User Guide for iPhone. https:\/\/support.apple.com\/guide\/iphone\/iph3e2e415f\/ios Accessed: 2025-01-XX."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3282894.3282921"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/1805986.1806005"},{"key":"e_1_3_3_2_8_2","unstructured":"Yuxiang Chai Siyuan Huang Yazhe Niu Han Xiao Liang Liu Dingyu Zhang Peng Gao Shuai Ren and Hongsheng Li. 2024. Amex: Android multi-annotation expo dataset for mobile gui agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.17490 (2024)."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3377811.3380327"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.505"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Jyoti Choudrie Sutee Pheeraphuttranghkoon and Soheil Davari. 2020. The digital divide and older adult population adoption use and diffusion of mobile phones: A quantitative study. Information Systems Frontiers 22 3 (2020) 673\u2013695.","DOI":"10.1007\/s10796-018-9875-2"},{"key":"e_1_3_3_2_13_2","volume-title":"Web Content Accessibility Guidelines (WCAG) 2.1","author":"Consortium World Wide\u00a0Web","year":"2018","unstructured":"World Wide\u00a0Web Consortium. 2018. Web Content Accessibility Guidelines (WCAG) 2.1. https:\/\/www.w3.org\/TR\/WCAG21\/ Retrieved September 23, 2023."},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3126594.3126651"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/1753326.1753554"},{"key":"e_1_3_3_2_16_2","volume-title":"TalkBack Screen Reader for Android","year":"2023","unstructured":"Google. 2023. TalkBack Screen Reader for Android. https:\/\/support.google.com\/accessibility\/android\/answer\/6007100 Accessed: 2025-01-XX."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Nora Griffin-Shirley Devender\u00a0R Banda Paul\u00a0M Ajuwon Jongpil Cheon Jaehoon Lee Hye\u00a0Ran Park and Sanpalei\u00a0N Lyngdoh. 2017. A survey on the use of mobile applications for people who are visually impaired. Journal of Visual Impairment & Blindness 111 4 (2017) 307\u2013323.","DOI":"10.1177\/0145482X1711100402"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"crossref","unstructured":"William Grussenmeyer and Eelke Folmer. 2017. Accessible touchscreen technology for people with visual impairments: a survey. ACM Transactions on Accessible Computing (TACCESS) 9 2 (2017) 1\u201331.","DOI":"10.1145\/3022701"},{"key":"e_1_3_3_2_19_2","first-page":"3929","volume-title":"International conference on machine learning","author":"Guu Kelvin","year":"2020","unstructured":"Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Mingwei Chang. 2020. Retrieval augmented language model pre-training. In International conference on machine learning. PMLR, 3929\u20133938."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01354"},{"key":"e_1_3_3_2_21_2","unstructured":"Edward\u00a0J Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang Weizhu Chen et\u00a0al. 2022. Lora: Low-rank adaptation of large language models. ICLR 1 2 (2022) 3."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02238"},{"key":"e_1_3_3_2_23_2","unstructured":"Aaron Hurst Adam Lerer Adam\u00a0P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et\u00a0al. 2024. Gpt-4o system card. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.21276 (2024)."},{"key":"e_1_3_3_2_24_2","unstructured":"Rashedul Islam Rofiqul Islam and Tohidul Mazumder. 2010. Mobile application and its global impact. International Journal of Engineering & Technology 10 6 (2010) 72\u201378."},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/1639642.1639663"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Akif Khan and Shah Khusro. 2021. An insight into smartphone-based assistive solutions for visually impaired and blind people: issues challenges and opportunities. Universal Access in the Information Society 20 2 (2021) 265\u2013298.","DOI":"10.1007\/s10209-020-00733-8"},{"key":"e_1_3_3_2_27_2","first-page":"18893","volume-title":"International Conference on Machine Learning","author":"Lee Kenton","year":"2023","unstructured":"Kenton Lee, Mandar Joshi, Iulia\u00a0Raluca Turc, Hexiang Hu, Fangyu Liu, Julian\u00a0Martin Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, and Kristina Toutanova. 2023. Pix2struct: Screenshot parsing as pretraining for visual language understanding. In International Conference on Machine Learning. PMLR, 18893\u201318912."},{"key":"e_1_3_3_2_28_2","unstructured":"Patrick Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel et\u00a0al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in neural information processing systems 33 (2020) 9459\u20139474."},{"key":"e_1_3_3_2_29_2","unstructured":"Gang Li and Yang Li. 2022. Spotlight: Mobile ui understanding using vision-language models with a focus. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2209.14927 (2022)."},{"key":"e_1_3_3_2_30_2","first-page":"19730","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730\u201319742."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Wen Li Ying Zhang Yifang Sun Wei Wang Mingjie Li Wenjie Zhang and Xuemin Lin. 2019. Approximate nearest neighbor search on high dimensional data\u2014experiments analyses and improvement. IEEE Transactions on Knowledge and Data Engineering 32 8 (2019) 1475\u20131488.","DOI":"10.1109\/TKDE.2019.2909204"},{"key":"e_1_3_3_2_32_2","unstructured":"Yang Li Gang Li Luheng He Jingjie Zheng Hong Li and Zhiwei Guan. 2020. Widget captioning: Generating natural language description for mobile user interface elements. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.04295 (2020)."},{"key":"e_1_3_3_2_33_2","first-page":"74","volume-title":"Text summarization branches out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"crossref","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems 36 (2023) 34892\u201334916.","DOI":"10.52202\/075280-1516"},{"key":"e_1_3_3_2_35_2","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1711.05101 (2017)."},{"key":"e_1_3_3_2_36_2","first-page":"260","volume-title":"Wcre","author":"Memon Atif\u00a0M","year":"2003","unstructured":"Atif\u00a0M Memon, Ishan Banerjee, and Adithya Nagarajan. 2003. GUI ripping: reverse engineering of graphical user interfaces for testing.. In Wcre , Vol.\u00a03. 260."},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"crossref","unstructured":"Alan\u00a0F Newell. 2008. Accessible computing\u2013past trends and future suggestions: Commentary on \u201cComputers and people with disabilities\u201d. ACM Transactions on Accessible Computing (TACCESS) 1 2 (2008) 1\u20137.","DOI":"10.1145\/1408760.1408763"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASE.2015.32"},{"key":"e_1_3_3_2_39_2","first-page":"311","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318."},{"key":"e_1_3_3_2_40_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3132525.3132547"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3234695.3236364"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Deb Roy. 2000. Learning visually grounded words and syntax of natural spoken language. Evolution of communication 4 1 (2000) 33\u201356.","DOI":"10.1075\/eoc.4.1.04roy"},{"key":"e_1_3_3_2_44_2","unstructured":"Richard\u00a0S Schwerdtfeger. 1991. Making the GUI talk. Byte (1991) 118\u2013128."},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"crossref","unstructured":"Suraj\u00a0Singh Senjam Souvik Manna and Covadonga Bascaran. 2021. Smartphones-based assistive technology: accessibility features and apps for people with visual impairment and its usage challenges and usability testing. Clinical optometry (2021) 311\u2013322.","DOI":"10.2147\/OPTO.S336361"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"crossref","unstructured":"Ben Shneiderman. 2000. Universal usability. Commun. ACM 43 5 (2000) 84\u201391.","DOI":"10.1145\/332833.332843"},{"key":"e_1_3_3_2_47_2","first-page":"5636","volume-title":"Proceedings of the 29th International Conference on Computational Linguistics","author":"Sunkara Srinivas","year":"2022","unstructured":"Srinivas Sunkara, Maria Wang, Lijuan Liu, Gilles Baechler, Yu-Chung Hsiao, Jindong Chen, Abhanshu Sharma, and James\u00a0WW Stout. 2022. Towards better semantic understanding of mobile interfaces. In Proceedings of the 29th International Conference on Computational Linguistics. 5636\u20135650."},{"key":"e_1_3_3_2_48_2","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew\u00a0M Dai Anja Hauth Katie Millican et\u00a0al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.11805 (2023)."},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01481"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3472749.3474765"},{"key":"e_1_3_3_2_52_2","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et\u00a0al. 2024. Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.12191 (2024)."},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00217"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/1622176.1622213"},{"key":"e_1_3_3_2_55_2","unstructured":"Shi Yu Chaoyue Tang Bokai Xu Junbo Cui Junhao Ran Yukun Yan Zhenghao Liu Shuo Wang Xu Han Zhiyuan Liu et\u00a0al. 2024. Visrag: Vision-based retrieval-augmented generation on multi-modality documents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.10594 (2024)."}],"event":{"name":"W4A '26: The 23rd International Web for All Conference","location":"Dubai United Arab Emirates","acronym":"W4A '26"},"container-title":["Proceedings of the 23rd International Web for All Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3800424.3800448","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T03:28:00Z","timestamp":1780457280000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3800424.3800448"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,13]]},"references-count":54,"alternative-id":["10.1145\/3800424.3800448","10.1145\/3800424"],"URL":"https:\/\/doi.org\/10.1145\/3800424.3800448","relation":{},"subject":[],"published":{"date-parts":[[2026,4,13]]},"assertion":[{"value":"2026-06-02","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}