{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T11:05:39Z","timestamp":1772363139059,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":93,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,28]]},"DOI":"10.1145\/3744257.3744275","type":"proceedings-article","created":{"date-parts":[[2025,10,15]],"date-time":"2025-10-15T08:57:43Z","timestamp":1760518663000},"page":"36-47","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["AccessMenu: Enhancing Usability of Online Restaurant Menus for Screen Reader Users"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-5428-5052","authenticated-orcid":false,"given":"Nithiya","family":"Venkatraman","sequence":"first","affiliation":[{"name":"Department of Computer Science, Old Dominion University, Norfolk, Virginia, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9992-9706","authenticated-orcid":false,"given":"Akshay","family":"Kolgar Nayak","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Old Dominion University, Norfolk, Virginia, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3785-6013","authenticated-orcid":false,"given":"Suyog","family":"Dahal","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Old Dominion University, Norfolk, Virginia, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8593-327X","authenticated-orcid":false,"given":"Yash","family":"Prakash","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Old Dominion University, Norfolk, Virginia, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2183-1722","authenticated-orcid":false,"given":"Hae-Na","family":"Lee","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Michigan State University, East Lansing, Michigan, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4772-1265","authenticated-orcid":false,"given":"Vikas","family":"Ashok","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Old Dominion University, Norfolk, Virginia, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,10,15]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"[n. d.]. claude. ([n. d.]). https:\/\/www.anthropic.com\/news\/claude-3-5-sonnet"},{"key":"e_1_3_3_2_3_2","unstructured":"[n. d.]. gpt4. ([n. d.]). https:\/\/openai.com\/index\/gpt-4o-mini-advancing-cost-efficient-intelligence\/"},{"key":"e_1_3_3_2_4_2","unstructured":"[n. d.]. llama. ([n. d.]). https:\/\/www.llama.com\/"},{"key":"e_1_3_3_2_5_2","unstructured":"[n. d.]. w3org. ([n. d.]). https:\/\/www.w3.org\/TR\/2006\/WDaria-roadmap-20060926\/"},{"key":"e_1_3_3_2_6_2","unstructured":"Nov 2008. WCAG. (Nov 2008). https:\/\/www.w3.org\/WAI\/WCAG22\/quickref\/?versions=2.1#text-alternatives\/"},{"key":"e_1_3_3_2_7_2","unstructured":"Nov 2008. webaim. (Nov 2008). https:\/\/webaim.org\/techniques\/alttext\/"},{"key":"e_1_3_3_2_8_2","unstructured":"NV Access. 2018. NVDA screen-reader."},{"key":"e_1_3_3_2_9_2","first-page":"693","volume-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track","author":"Aggarwal Kriti","year":"2023","unstructured":"Kriti Aggarwal, Aditi Khandelwal, Kumar Tanmay, Owais\u00a0Khan Mohammed, Qiang Liu, Monojit Choudhury, Hardik Chauhan, Subhojit Som, Vishrav Chaudhary, and Saurabh Tiwary. 2023. DUBLIN: Visual Document Understanding By Language-Image Network. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track. 693\u2013706."},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Patrizia Andronico Marina Buzzi Carlos Castillo and Barbara Leporini. 2006. Improving search engine interfaces for blind users: a case study. Universal Access in the Information Society 5 (2006) 23\u201340.","DOI":"10.1007\/s10209-006-0022-3"},{"key":"e_1_3_3_2_11_2","volume-title":"VoiceOver","author":"Apple Inc","year":"2023","unstructured":"Inc Apple. 2023. VoiceOver. https:\/\/www.apple.com\/voiceover\/info\/guide\/_1121.html"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3320435.3320460"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3025171.3025229"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.4018\/978-1-60566-026-4.ch646"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3132525.3132531"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173594"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/1805986.1806005"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"crossref","unstructured":"James\u00a0V Bradley. 1958. Complete counterbalancing of immediate sequential effects in a Latin square design. J. Amer. Statist. Assoc. 53 282 (1958) 525\u2013528.","DOI":"10.1080\/01621459.1958.10501456"},{"key":"e_1_3_3_2_19_2","unstructured":"John Brooke. 1996. SUS: A \u2018quick and dirty\u2019usability scale. Usability Evaluation in Industry. PW Jordan B Thomas BA Weerdmeester and AL McClelland."},{"key":"e_1_3_3_2_20_2","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared\u00a0D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et\u00a0al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Giovanni Campagna Silei Xu Rakesh Ramesh Michael Fischer and Monica\u00a0S Lam. 2018. Controlling fine-grain sharing in natural language with a virtual assistant. Proceedings of the ACM on Interactive Mobile Wearable and Ubiquitous Technologies 2 3 (2018) 1\u201328.","DOI":"10.1145\/3264905"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Loredana Caruccio Stefano Cirillo Giuseppe Polese Giandomenico Solimando Shanmugam Sundaramurthy and Genoveffa Tortora. 2024. Claude 2.0 large language model: Tackling a real-world classification problem with a new iterative prompt engineering approach. Intelligent Systems with Applications 21 (2024) 200336.","DOI":"10.1016\/j.iswa.2024.200336"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-43427-3_35"},{"key":"e_1_3_3_2_24_2","unstructured":"DocuSign. 2025. Join More Than 1 Billion Users Who Trust DocuSign. https:\/\/www.docusign.com Accessed: [Your Access Date]."},{"key":"e_1_3_3_2_25_2","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et\u00a0al. 2024. The llama 3 herd of models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.21783 (2024)."},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Yansong Feng and Mirella Lapata. 2012. Automatic caption generation for news images. IEEE transactions on pattern analysis and machine intelligence 35 4 (2012) 797\u2013812.","DOI":"10.1109\/TPAMI.2012.118"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3490099.3511126"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3412841.3442066"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/EDUCON.2016.7474628"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Boni Garc\u00eda Mario Munoz-Organero Carlos Alario-Hoyos and Carlos\u00a0Delgado Kloos. 2021. Automated driver management for selenium WebDriver. Empirical Software Engineering 26 (2021) 1\u201351.","DOI":"10.1007\/s10664-021-09975-3"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376728"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3174092"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1016\/S0166-4115(08)62386-9"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-78092-0_15"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21322"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548112"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606735"},{"key":"e_1_3_3_2_38_2","volume-title":"Workshop on Document Intelligence at NeurIPS 2019","author":"Hwang Wonseok","year":"2019","unstructured":"Wonseok Hwang, Seonghyeon Kim, Minjoon Seo, Jinyeong Yim, Seunghyun Park, Sungrae Park, Junyeop Lee, Bado Lee, and Hwalsuk Lee. 2019. Post-ocr parsing: building simple and robust parser via bio tagging. In Workshop on Document Intelligence at NeurIPS 2019."},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"crossref","unstructured":"Md\u00a0Farhan Ishmam Md\u00a0Sakib\u00a0Hossain Shovon Muhammad\u00a0Firoz Mridha and Nilanjan Dey. 2024. From image to language: A critical analysis of visual question answering (vqa) approaches challenges and opportunities. Information Fusion 106 (2024) 102270.","DOI":"10.1016\/j.inffus.2024.102270"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00592"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093494"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Muiz\u00a0Ahmed Khan Pias Paul Mahmudur Rashid Mainul Hossain and Md\u00a0Atiqur\u00a0Rahman Ahad. 2020. An AI-based visual aid with integrated reading assistant for the completely blind. IEEE Transactions on Human-Machine Systems 50 6 (2020) 507\u2013517.","DOI":"10.1109\/THMS.2020.3027534"},{"key":"e_1_3_3_2_43_2","unstructured":"Geewook Kim Teakgyu Hong Moonbin Yim Jinyoung Park Jinyeong Yim Wonseok Hwang Sangdoo Yun Dongyoon Han and Seunghyun Park. 2021. Donut: Document understanding transformer without ocr. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2111.15664 7 15 (2021) 2."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Jonathan Lazar Aaron Allen Jason Kleinman and Chris Malarkey. 2007. What frustrates screen reader users on the web: A study of 100 blind users. International Journal of human-computer interaction 22 3 (2007) 247\u2013269.","DOI":"10.1080\/10447310709336964"},{"key":"e_1_3_3_2_45_2","unstructured":"Jonathan Lazar Abiodun Olalere and Brian Wentz. 2012. Investigating the accessibility and usability of job application web sites for blind users. Journal of Usability Studies 7 2 (2012)."},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"crossref","unstructured":"Hae-Na Lee and Vikas Ashok. 2022. Customizable Tabular Access to Web Data Records for Convenient Low-vision Screen Magnifier Interaction. ACM Transactions on Accessible Computing (TACCESS) 15 2 (2022) 1\u201322.","DOI":"10.1145\/3517044"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-60149-2_23"},{"key":"e_1_3_3_2_48_2","first-page":"18893","volume-title":"International Conference on Machine Learning","author":"Lee Kenton","year":"2023","unstructured":"Kenton Lee, Mandar Joshi, Iulia\u00a0Raluca Turc, Hexiang Hu, Fangyu Liu, Julian\u00a0Martin Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, and Kristina Toutanova. 2023. Pix2struct: Screenshot parsing as pretraining for visual language understanding. In International Conference on Machine Learning. PMLR, 18893\u201318912."},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"crossref","unstructured":"Maurizio Leotta Fabrizio Mori and Marina Ribaudo. 2023. Evaluating the effectiveness of automatic image captioning for web accessibility. Universal access in the information society 22 4 (2023) 1293\u20131313.","DOI":"10.1007\/s10209-022-00906-7"},{"key":"e_1_3_3_2_50_2","first-page":"19730","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730\u201319742."},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"crossref","unstructured":"Sheng Li Zhiqiang Tao Kang Li and Yun Fu. 2019. Visual to text: Survey of image and video captioning. IEEE Transactions on Emerging Topics in Computational Intelligence 3 4 (2019) 297\u2013312.","DOI":"10.1109\/TETCI.2019.2892755"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01472"},{"key":"e_1_3_3_2_53_2","unstructured":"Yuanchun Li Hao Wen Weijun Wang Xiangyu Li Yizhen Yuan Guohong Liu Jiacheng Liu Wenxing Xu Xiang Wang Yi Sun et\u00a0al. 2024. Personal llm agents: Insights and survey about the capability efficiency and security. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.05459 (2024)."},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/GHTC.2016.7857276"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.1145\/3371300.3383343"},{"key":"e_1_3_3_2_56_2","unstructured":"Fangyu Liu Francesco Piccinno Syrine Krichene Chenxi Pang Kenton Lee Mandar Joshi Yasemin Altun Nigel Collier and Julian\u00a0Martin Eisenschlos. 2022. Matcha: Enhancing visual language pretraining with math reasoning and chart derendering. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2212.09662 (2022)."},{"key":"e_1_3_3_2_57_2","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems 36 (2024)."},{"key":"e_1_3_3_2_58_2","unstructured":"Yuliang Liu Zhang Li Biao Yang Chunyuan Li Xucheng Yin Cheng-lin Liu Lianwen Jin and Xiang Bai. 2023. On the hidden mystery of ocr in large multimodal models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.07895 (2023)."},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/2745555.2746666"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"crossref","unstructured":"Yue Ming Nannan Hu Chunxiao Fan Fan Feng Jiangwan Zhou and Hui Yu. 2022. Visuals to text: A comprehensive review on automatic image captioning. IEEE\/CAA Journal of Automatica Sinica 9 8 (2022) 1339\u20131365.","DOI":"10.1109\/JAS.2022.105734"},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"crossref","unstructured":"Juan Nino Sherezada Ochoa Jocelyne Kiss Geoffreyjen Edwards Ernesto Morales James Hutson Fr\u00e9d\u00e9rique Poncet and Walter Wittich. 2024. Assistive Technologies for Internet Navigation: A Review of Screen Reader Solutions for the Blind and Visually Impaired. International Journal of Recent Engineering Science 11 6 (2024).","DOI":"10.14445\/23497157\/IJRES-V11I6P122"},{"key":"e_1_3_3_2_63_2","unstructured":"Vincent Perot Kai Kang Florian Luisier Guolong Su Xiaoyu Sun Ramya\u00a0Sree Boppana Zilong Wang Zifeng Wang Jiaqi Mu Hao Zhang et\u00a0al. 2023. Lmdx: Language model-based document information extraction and localization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.10952 (2023)."},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3678957.3685714"},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"crossref","unstructured":"Yash Prakash Akshay\u00a0Kolgar Nayak Mohan Sunkara Sampath Jayarathna Hae-Na Lee and Vikas Ashok. 2024. All in One Place: Ensuring Usable Access to Online Shopping Items for Blind Users. Proceedings of the ACM on Human-Computer Interaction 8 EICS (2024) 1\u201325.","DOI":"10.1145\/3664639"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581641.3584049"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/2384916.2384999"},{"key":"e_1_3_3_2_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/2207016.2207054"},{"key":"e_1_3_3_2_69_2","volume-title":"The coding manual for qualitative researchers","author":"Salda\u00f1a Johnny","year":"2015","unstructured":"Johnny Salda\u00f1a. 2015. The coding manual for qualitative researchers. Sage."},{"key":"e_1_3_3_2_70_2","unstructured":"Johnny Salda\u00f1a. 2021. The coding manual for qualitative researchers. (2021)."},{"key":"e_1_3_3_2_71_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445242"},{"key":"e_1_3_3_2_72_2","unstructured":"Freedom Scientific. 2020. JAWS OCR What It Is and How It Works!https:\/\/www.freedomscientific.com\/webinars\/jaws-ocr-what-it-is-and-how-it-works\/ Accessed: Mar. 7 2025."},{"key":"e_1_3_3_2_73_2","unstructured":"Freedom Scientific. 2020. JAWS \u00ae \u2013 Freedom Scientific. http:\/\/www.freedomscientific.com\/products\/software\/jaws\/."},{"key":"e_1_3_3_2_74_2","doi-asserted-by":"crossref","unstructured":"Woosuk Seo and Hyunggu Jung. 2022. Challenges and opportunities to improve the accessibility of YouTube for people with visual impairments as content creators. Universal Access in the Information Society 21 3 (2022) 767\u2013770.","DOI":"10.1007\/s10209-020-00787-8"},{"key":"e_1_3_3_2_75_2","doi-asserted-by":"crossref","unstructured":"Weishi Shi Heather Moses Qi Yu Samuel Malachowsky and Daniel\u00a0E Krutz. 2023. All: Supporting experiential accessibility education and inclusive software development. ACM Transactions on Software Engineering and Methodology 33 2 (2023) 1\u201330.","DOI":"10.1145\/3625292"},{"key":"e_1_3_3_2_76_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICEECCOT.2017.8284628"},{"key":"e_1_3_3_2_77_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-16-1092-9_7"},{"key":"e_1_3_3_2_78_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72437-4_12"},{"key":"e_1_3_3_2_79_2","doi-asserted-by":"crossref","unstructured":"Mohan Sunkara Yash Prakash Hae-Na Lee Sampath Jayarathna and Vikas Ashok. 2023. Enabling Customization of Discussion Forums for Blind Users. Proceedings of the ACM on Human-Computer Interaction 7 EICS (2023) 1\u201320.","DOI":"10.1145\/3593228"},{"key":"e_1_3_3_2_80_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01845"},{"key":"e_1_3_3_2_81_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-62846-7_3"},{"key":"e_1_3_3_2_82_2","doi-asserted-by":"crossref","unstructured":"Mary\u00a0Frances Theofanos and Janice Redish. 2003. Bridging the gap: between accessibility and usability. interactions 10 6 (2003) 36\u201351.","DOI":"10.1145\/947226.947227"},{"key":"e_1_3_3_2_83_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-86337-1_42"},{"key":"e_1_3_3_2_84_2","doi-asserted-by":"crossref","unstructured":"Tejal Tiwary and Rajendra\u00a0Prasad Mahapatra. 2023. Enhancement in web accessibility for visually impaired people using hybrid deep belief network\u2013bald eagle search. Multimedia Tools and Applications (2023) 1\u201322.","DOI":"10.1007\/s11042-023-14494-y"},{"key":"e_1_3_3_2_85_2","doi-asserted-by":"crossref","unstructured":"Utku Uckun Rohan\u00a0Tumkur Suresh Md\u00a0Javedul Ferdous Xiaojun Bi IV Ramakrishnan and Vikas Ashok. 2022. Taming User-Interface Heterogeneity with Uniform Overlays for Blind Users. (2022).","DOI":"10.1145\/3503252.3531317"},{"key":"e_1_3_3_2_86_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-25364-5_26"},{"key":"e_1_3_3_2_87_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.416"},{"key":"e_1_3_3_2_88_2","unstructured":"Jason Wei Maarten Bosma Vincent\u00a0Y Zhao Kelvin Guu Adams\u00a0Wei Yu Brian Lester Nan Du Andrew\u00a0M Dai and Quoc\u00a0V Le. 2021. Finetuned language models are zero-shot learners. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2109.01652 (2021)."},{"key":"e_1_3_3_2_89_2","unstructured":"Jason Wei Xuezhi Wang Dale Schuurmans Maarten Bosma Fei Xia Ed Chi Quoc\u00a0V Le Denny Zhou et\u00a0al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems 35 (2022) 24824\u201324837."},{"key":"e_1_3_3_2_90_2","unstructured":"Yang Xu Yiheng Xu Tengchao Lv Lei Cui Furu Wei Guoxin Wang Yijuan Lu Dinei Florencio Cha Zhang Wanxiang Che et\u00a0al. 2020. Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2012.14740 (2020)."},{"key":"e_1_3_3_2_91_2","unstructured":"Zhengyuan Yang Linjie Li Kevin Lin Jianfeng Wang Chung-Ching Lin Zicheng Liu and Lijuan Wang. 2023. The dawn of lmms: Preliminary explorations with gpt-4v (vision). arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.17421 9 1 (2023) 1."},{"key":"e_1_3_3_2_92_2","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et\u00a0al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.14178 (2023)."},{"key":"e_1_3_3_2_93_2","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.10592 (2023)."},{"key":"e_1_3_3_2_94_2","unstructured":"Wang Zhu Alekh Agarwal Mandar Joshi Robin Jia Jesse Thomason and Kristina Toutanova. 2023. Efficient End-to-End Visual Document Understanding with Rationale Distillation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.09612 (2023)."}],"event":{"name":"W4A '25: The 22nd International Web for All Conference","location":"Sydney Australia","acronym":"W4A '25"},"container-title":["Proceedings of the 22nd International Web for All Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3744257.3744275","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,15]],"date-time":"2025-10-15T09:26:23Z","timestamp":1760520383000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3744257.3744275"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,28]]},"references-count":93,"alternative-id":["10.1145\/3744257.3744275","10.1145\/3744257"],"URL":"https:\/\/doi.org\/10.1145\/3744257.3744275","relation":{},"subject":[],"published":{"date-parts":[[2025,4,28]]},"assertion":[{"value":"2025-10-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}