{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T20:52:05Z","timestamp":1776113525597,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372408"],"award-info":[{"award-number":["62372408"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,28]]},"DOI":"10.1145\/3696410.3714523","type":"proceedings-article","created":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T22:47:11Z","timestamp":1745362031000},"page":"5096-5107","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Towards an Inclusive Mobile Web: A Dataset and Framework for Focusability in UI Accessibility"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2951-5256","authenticated-orcid":false,"given":"Ming","family":"Gu","sequence":"first","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1212-7313","authenticated-orcid":false,"given":"Lei","family":"Pei","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3645-1041","authenticated-orcid":false,"given":"Sheng","family":"Zhou","sequence":"additional","affiliation":[{"name":"Zhejiang Key Laboratory of Accessible Perception and Intelligent Systems, Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4321-5905","authenticated-orcid":false,"given":"Ming","family":"Shen","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0189-4011","authenticated-orcid":false,"given":"Yuxuan","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5736-4805","authenticated-orcid":false,"given":"Zirui","family":"Gao","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4479-3738","authenticated-orcid":false,"given":"Ziwei","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6695-7954","authenticated-orcid":false,"given":"Shuo","family":"Shan","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6605-9793","authenticated-orcid":false,"given":"Wei","family":"Jiang","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1664-6425","authenticated-orcid":false,"given":"Yong","family":"Li","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1097-2044","authenticated-orcid":false,"given":"Jiajun","family":"Bu","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.57197\/JDR-2023-0060"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3377811.3380392"},{"key":"e_1_3_2_1_3_1","volume-title":"Screenai: A vision-language model for ui and infographics understanding. arXiv preprint arXiv:2402.04615","author":"Baechler Gilles","year":"2024","unstructured":"Gilles Baechler, Srinivas Sunkara, Maria Wang, Fedir Zubach, Hassan Mansoor, Vincent Etter, Victor C\u0103rbune, Jason Lin, Jindong Chen, and Abhanshu Sharma. 2024. Screenai: A vision-language model for ui and infographics understanding. arXiv preprint arXiv:2402.04615 (2024)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445762"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_6_1","unstructured":"Dongping Chen Yue Huang Siyuan Wu Jingyu Tang Liuyi Chen Yilin Bai Zhigang He Chenlong Wang Huichi Zhou Yiqiang Li et al. 2024b. GUI-WORLD: A Dataset for GUI-oriented Multimodal LLM-based Agents. arXiv preprint arXiv:2406.10819 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3391613"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3377811.3380327"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3623313"},{"key":"e_1_3_2_1_10_1","volume-title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. arXiv preprint arXiv:2312.14238","author":"Chen Zhe","year":"2023","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, Bin Li, Ping Luo, Tong Lu, Yu Qiao, and Jifeng Dai. 2023. InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. arXiv preprint arXiv:2312.14238 (2023)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126594.3126651"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2984511.2984581"},{"key":"e_1_3_2_1_13_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_14_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642350"},{"key":"e_1_3_2_1_16_1","volume-title":"Sign: Scalable inception graph neural networks. arXiv preprint arXiv:2004.11198","author":"Frasca Fabrizio","year":"2020","unstructured":"Fabrizio Frasca, Emanuele Rossi, Davide Eynard, Ben Chamberlain, Michael Bronstein, and Federico Monti. 2020. Sign: Scalable inception graph neural networks. arXiv preprint arXiv:2004.11198 (2020)."},{"key":"e_1_3_2_1_17_1","unstructured":"Google. 2020. Get started on Android with TalkBack - Android Accessibility Help. https:\/\/support.google.com\/accessibility\/android\/answer\/6283677?hl=en Accessed: 2024--12-02."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3614915"},{"key":"e_1_3_2_1_19_1","volume-title":"Universal Inceptive GNNs by Eliminating the Smoothness-generalization Dilemma. arXiv preprint arXiv:2412.09805","author":"Gu Ming","year":"2024","unstructured":"Ming Gu, Zhuonan Zheng, Sheng Zhou, Meihan Liu, Jiawei Chen, Tanyu Qiao, Liangcheng Li, and Jiajun Bu. 2024. Universal Inceptive GNNs by Eliminating the Smoothness-generalization Dilemma. arXiv preprint arXiv:2412.09805 (2024)."},{"key":"e_1_3_2_1_20_1","volume-title":"Inductive representation learning on large graphs. Advances in neural information processing systems","author":"Hamilton Will","year":"2017","unstructured":"Will Hamilton, Zhitao Ying, and Jure Leskovec. 2017. Inductive representation learning on large graphs. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01354"},{"key":"e_1_3_2_1_22_1","volume-title":"Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907","author":"Kipf Thomas N","year":"2016","unstructured":"Thomas N Kipf and Max Welling. 2016. Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907 (2016)."},{"key":"e_1_3_2_1_23_1","unstructured":"Jonathan Lazar Daniel F Goldstein and Anne Taylor. 2015. Ensuring digital accessibility through process and policy. Morgan kaufmann."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3502042"},{"key":"e_1_3_2_1_25_1","volume-title":"Widget captioning: Generating natural language description for mobile user interface elements. arXiv preprint arXiv:2010.04295","author":"Li Yang","year":"2020","unstructured":"Yang Li, Gang Li, Luheng He, Jingjie Zheng, Hong Li, and Zhiwei Guan. 2020. Widget captioning: Generating natural language description for mobile user interface elements. arXiv preprint arXiv:2010.04295 (2020)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242587.3242650"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242587.3242650"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3551349.3560424"},{"key":"e_1_3_2_1_29_1","volume-title":"GUI metadata, and labeled images of GUI components.","author":"Moran Kevin","year":"2018","unstructured":"Kevin Moran, CB Cardenas, M Curcio, R Bonett, and D Poshyvanyk. 2018. The ReDraw dataset: A set of Android screenshots, GUI metadata, and labeled images of GUI components. (2018)."},{"key":"e_1_3_2_1_30_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Rawles Christopher","year":"2024","unstructured":"Christopher Rawles, Alice Li, Daniel Rodriguez, Oriana Riva, and Timothy Lillicrap. 2024. Androidinthewild: A large-scale dataset for android device control. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3132525.3132547"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445455"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445455"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517497"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.60087\/jklst.vol2.n2.p433"},{"key":"e_1_3_2_1_36_1","volume-title":"The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=wKPmPBHSnT6","author":"Song Yunchong","year":"2023","unstructured":"Yunchong Song, Chenghu Zhou, Xinbing Wang, and Zhouhan Lin. 2023. Ordered GNN: Ordering Message Passing to Deal with Heterophily and Over-smoothing. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=wKPmPBHSnT6"},{"key":"e_1_3_2_1_37_1","volume-title":"Meta-gui: Towards multi-modal conversational agents on mobile gui. arXiv preprint arXiv:2205.11029","author":"Sun Liangtai","year":"2022","unstructured":"Liangtai Sun, Xingyu Chen, Lu Chen, Tianle Dai, Zichen Zhu, and Kai Yu. 2022. Meta-gui: Towards multi-modal conversational agents on mobile gui. arXiv preprint arXiv:2205.11029 (2022)."},{"key":"e_1_3_2_1_38_1","unstructured":"AutoGUI Team. 2024. AutoGUI-v1--702k Dataset. https:\/\/huggingface.co\/datasets\/AutoGUI\/AutoGUI-v1--702k. Accessed: 2024--11--16."},{"key":"e_1_3_2_1_39_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_40_1","unstructured":"Petar Velickovic Guillem Cucurull Arantxa Casanova Adriana Romero Pietro Lio Yoshua Bengio et al. 2017. Graph attention networks. stat Vol. 1050 20 (2017) 10--48550."},{"key":"e_1_3_2_1_41_1","unstructured":"W3C Web Accessibility Initiative (WAI). 2008. Mobile Accessibility at W3C. https:\/\/www.w3.org\/WAI\/standards-guidelines\/mobile\/ Accessed: 2024--12-02."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472749.3474765"},{"key":"e_1_3_2_1_43_1","volume-title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191","author":"Wang Peng","year":"2024","unstructured":"Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Yang Fan, Kai Dang, Mengfei Du, Xuancheng Ren, Rui Men, Dayiheng Liu, Chang Zhou, Jingren Zhou, and Junyang Lin. 2024. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"Introduction to Web Accessibility. https:\/\/www.w3.org\/WAI\/fundamentals\/accessibility-intro\/ Accessed: 2024--11--26. First published","author":"Web Accessibility","year":"2005","unstructured":"Web Accessibility Initiative (WAI). 2024. Introduction to Web Accessibility. https:\/\/www.w3.org\/WAI\/fundamentals\/accessibility-intro\/ Accessed: 2024--11--26. First published: February 2005. Last updated: 7 March 2024."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649379"},{"key":"e_1_3_2_1_46_1","volume-title":"Mobilevlm: A vision-language model for better intra-and inter-ui understanding. arXiv preprint arXiv:2409.14818","author":"Wu Qinzhuo","year":"2024","unstructured":"Qinzhuo Wu, Weikai Xu, Wei Liu, Tao Tan, Jianfeng Liu, Ang Li, Jian Luan, Bin Wang, and Shuo Shang. 2024a. Mobilevlm: A vision-language model for better intra-and inter-ui understanding. arXiv preprint arXiv:2409.14818 (2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Wu Qitian","year":"2024","unstructured":"Qitian Wu, Wentao Zhao, Chenxiao Yang, Hengrui Zhang, Fan Nie, Haitian Jiang, Yatao Bian, and Junchi Yan. 2024b. Simplifying and empowering transformers for large-graph representations. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2024.102679"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCC59590.2023.10507590"},{"key":"e_1_3_2_1_50_1","volume-title":"IBM strengthens mobile app accessibility and usability","author":"Yan Shunguo","year":"2016","unstructured":"Shunguo Yan. 2016. IBM strengthens mobile app accessibility and usability. IBM, Armonk, NY, USA (2016)."},{"key":"e_1_3_2_1_51_1","unstructured":"Yuan Yao Tianyu Yu Ao Zhang Chongyi Wang Junbo Cui Hongji Zhu Tianchi Cai Haoyu Li Weilin Zhao Zhihui He et al. 2024. MiniCPM-V: A GPT-4V Level MLLM on Your Phone. arXiv preprint arXiv:2408.01800 (2024)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73039-9_14"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.infsof.2024.107518"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445186"},{"key":"e_1_3_2_1_55_1","volume-title":"You only look at screens: Multimodal chain-of-action agents. arXiv preprint arXiv:2309.11436","author":"Zhang Zhuosheng","year":"2023","unstructured":"Zhuosheng Zhang and Aston Zhang. 2023. You only look at screens: Multimodal chain-of-action agents. arXiv preprint arXiv:2309.11436 (2023)."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714523","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714523","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:32Z","timestamp":1750295912000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714523"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":55,"alternative-id":["10.1145\/3696410.3714523","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714523","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}