{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,13]],"date-time":"2026-06-13T07:22:23Z","timestamp":1781335343502,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":89,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T00:00:00Z","timestamp":1728604800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,13]]},"DOI":"10.1145\/3654777.3676375","type":"proceedings-article","created":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T10:50:36Z","timestamp":1728643836000},"page":"1-18","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":62,"title":["WorldScribe: Towards Context-Aware Live Visual Descriptions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7545-4136","authenticated-orcid":false,"given":"Ruei-Che","family":"Chang","sequence":"first","affiliation":[{"name":"Computer Science and Engineering, University of Michigan, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5023-1426","authenticated-orcid":false,"given":"Yuxuan","family":"Liu","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, University of Michigan, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4447-7818","authenticated-orcid":false,"given":"Anhong","family":"Guo","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, University of Michigan, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. Aira. https:\/\/aira.io\/"},{"key":"e_1_3_2_1_2_1","unstructured":"2024. BeMyEyes. https:\/\/www.bemyeyes.com\/"},{"key":"e_1_3_2_1_3_1","unstructured":"2024. BlindSquare. https:\/\/www.blindsquare.com\/"},{"key":"e_1_3_2_1_4_1","unstructured":"2024. Envision AI. https:\/\/www.letsenvision.com\/"},{"key":"e_1_3_2_1_5_1","unstructured":"2024. GPT-4. https:\/\/openai.com\/index\/gpt-4\/"},{"key":"e_1_3_2_1_6_1","unstructured":"2024. GPT-4 Omni. https:\/\/openai.com\/index\/hello-gpt-4o\/"},{"key":"e_1_3_2_1_7_1","unstructured":"2024. GPT-4 Vision. https:\/\/platform.openai.com\/docs\/guides\/vision"},{"key":"e_1_3_2_1_8_1","unstructured":"2024. Introducing Be My AI (formerly Virtual Volunteer) for People who are Blind or Have Low Vision Powered by OpenAI\u2019s GPT-4. https:\/\/www.bemyeyes.com\/blog\/introducing-be-my-eyes-virtual-volunteer"},{"key":"e_1_3_2_1_9_1","unstructured":"2024. Microsoft Soundscape. https:\/\/www.microsoft.com\/en-us\/research\/product\/soundscape\/"},{"key":"e_1_3_2_1_10_1","unstructured":"2024. Moondream. https:\/\/moondream.ai\/"},{"key":"e_1_3_2_1_11_1","unstructured":"2024. SeeingAI. https:\/\/www.seeingai.com\/"},{"key":"e_1_3_2_1_12_1","unstructured":"2024. Sound Analysis: Classify various sounds by analyzing audio files or streams.https:\/\/developer.apple.com\/documentation\/soundanalysis\/"},{"key":"e_1_3_2_1_13_1","unstructured":"2024. Specific Guidelines: Art Photos and Cartoons. http:\/\/diagramcenter.org\/specific-guidelines-final-draft.html"},{"key":"e_1_3_2_1_14_1","unstructured":"3PlayMedia. 2020. Beginner\u2019s Guide to Audio Description. https:\/\/go.3playmedia.com\/hubfs\/WP%20PDFs\/Beginners-Guide-to-Audio-Description.pdf. Accessed: 2021-01-13."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ATSIP49331.2020.9231933"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2935334.2935361"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376143"},{"key":"e_1_3_2_1_18_1","unstructured":"Audio Description\u00a0Project American Council of\u00a0the Blind. 2017. Guideline for Audio Describers. https:\/\/www.acb.org\/adp\/guidelines.html. Accessed: 2020-11-6."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/1866029.1866080"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/2441776.2441915"},{"key":"e_1_3_2_1_21_1","unstructured":"Northern\u00a0German Broadcasting. 2023. Audio description guidelines. https:\/\/www.ndr.de\/fernsehen\/barrierefreie_angebote\/audiodeskription\/Audio-description-guidelines audiodeskription142.html. Accessed: 2023-04-09."},{"key":"e_1_3_2_1_22_1","volume-title":"DESCRIPTIVE VIDEO PRODUCTION AND PRESENTATION BEST PRACTICES GUIDE FOR DIGITAL ENVIRONMENTS","author":"Canada Media\u00a0Access","unstructured":"Media\u00a0Access Canada. 2023. DESCRIPTIVE VIDEO PRODUCTION AND PRESENTATION BEST PRACTICES GUIDE FOR DIGITAL ENVIRONMENTS. http:\/\/www.mediac.ca\/DVBPGDE_V2_28Feb2012.asp. Accessed: 2023-04-09."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643834.3661556"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545613"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3663548.3675599"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1049\/joe.2019.1137"},{"key":"e_1_3_2_1_27_1","volume-title":"YOLO-World: Real-Time Open-Vocabulary Object Detection. arXiv preprint arXiv:2401.17270","author":"Cheng Tianheng","year":"2024","unstructured":"Tianheng Cheng, Lin Song, Yixiao Ge, Wenyu Liu, Xinggang Wang, and Ying Shan. 2024. YOLO-World: Real-Time Open-Vocabulary Object Detection. arXiv preprint arXiv:2401.17270 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"Described and Captioned Media Program (DCMP)","author":"Captioned\u00a0Media Program Described","unstructured":"Described and Captioned\u00a0Media Program. 2020. Described and Captioned Media Program (DCMP). http:\/\/www.descriptionkey.org\/quality_description.html. Accessed: 2019-03-19."},{"key":"e_1_3_2_1_29_1","volume-title":"SMERF: Streamable Memory Efficient Radiance Fields for Real-Time Large-Scene Exploration. arXiv preprint arXiv:2312.07541","author":"Duckworth Daniel","year":"2023","unstructured":"Daniel Duckworth, Peter Hedman, Christian Reiser, Peter Zhizhin, Jean-Fran\u00e7ois Thibert, Mario Lu\u010di\u0107, Richard Szeliski, and Jonathan\u00a0T Barron. 2023. SMERF: Streamable Memory Efficient Radiance Fields for Real-Time Large-Scene Exploration. arXiv preprint arXiv:2312.07541 (2023)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373625.3417022"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308558.3313605"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376728"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581249"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597638.3608422"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.21307\/ijom-2017-057"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3517428.3544824"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606735"},{"key":"e_1_3_2_1_38_1","unstructured":"The Smith-Kettlewell Eye\u00a0Research Institute. 2022. YouDescribe. https:\/\/youdescribe.org\/"},{"key":"e_1_3_2_1_39_1","volume-title":"Understanding Blind and Low Vision People\u2019s Video Accessibility Preferences Across Viewing Scenarios. arXiv preprint arXiv:2403.10792","author":"Jiang Lucy","year":"2024","unstructured":"Lucy Jiang, Crescentia Jung, Mahika Phutane, Abigale Stangl, and Shiri Azenkot. 2024. \" It\u2019s Kind of Context Dependent\": Understanding Blind and Low Vision People\u2019s Video Accessibility Preferences Across Viewing Scenarios. arXiv preprint arXiv:2403.10792 (2024)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597638.3608381"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376823"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300282"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3334480.3382925"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Elisa Kreiss Cynthia Bennett Shayan Hooshmand Eric Zelikman Meredith\u00a0Ringel Morris and Christopher Potts. 2022. Context Matters for Image Descriptions for Accessibility: Challenges for Referenceless Evaluation Metrics. arxiv:2205.10646\u00a0[cs.CL]","DOI":"10.18653\/v1\/2022.emnlp-main.309"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580687"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445451"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3546714"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/2513383.2517033"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501966"},{"key":"e_1_3_2_1_50_1","volume-title":"Conversations for Vision: Remote Sighted Assistants Helping People with Visual Impairments. arXiv preprint arXiv:1812.00148","author":"Lee Sooyeon","year":"2018","unstructured":"Sooyeon Lee, Madison Reddie, Krish Gurdasani, Xiying Wang, Jordan Beck, Mary\u00a0Beth Rosson, and John\u00a0M Carroll. 2018. Conversations for Vision: Remote Sighted Assistants Helping People with Visual Impairments. arXiv preprint arXiv:1812.00148 (2018)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376591"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376591"},{"key":"e_1_3_2_1_53_1","unstructured":"Chaojian Li Sixu Li Yang Zhao Wenbo Zhu and Yingyan Lin. 2022. RT-NeRF: Real-Time On-Device Neural Radiance Fields Towards Immersive AR\/VR Rendering. arxiv:2212.01120\u00a0[cs.AR]"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545703"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173633"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597638.3608395"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3663548.3675617"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3441852.3471201"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581023"},{"key":"e_1_3_2_1_62_1","unstructured":"American\u00a0Council of\u00a0the Blind. 2022. The Audio Description Project. https:\/\/adp.acb.org\/guidelines.html"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-020-10164-5"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415864"},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of Human Computer Interaction International (HCII) 71","author":"Petrie Helen","year":"2005","unstructured":"Helen Petrie, Chandra Harrison, and Sundeep Dev. 2005. Describing images on the web: a survey of current practice and prospects for the future. Proceedings of Human Computer Interaction International (HCII) 71, 2 (2005)."},{"key":"e_1_3_2_1_66_1","unstructured":"Audio\u00a0Description Project. 2023. Recommendation of the Federal Communications Commission disability...https:\/\/adp.acb.org\/docs\/DAC%20Recommendation%20on%20Audo%20Description%20Quality%20Adopted%20October%2014%202020.pdf"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1609\/hcomp.v5i1.13301"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"crossref","unstructured":"Elliot Salisbury Ece Kamar and Meredith\u00a0Ringel Morris. 2018. Evaluating and Complementing Vision-to-Language Technology for People who are Blind with Conversational Crowdsourcing.. In IJCAI. 5349\u20135353.","DOI":"10.24963\/ijcai.2018\/751"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00852"},{"key":"e_1_3_2_1_70_1","volume-title":"The craft of information visualization","author":"Shneiderman Ben","unstructured":"Ben Shneiderman. 2003. The eyes have it: A task by data type taxonomy for information visualizations. In The craft of information visualization. Elsevier, 364\u2013371."},{"key":"e_1_3_2_1_71_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597638.3608402"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376404"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/3441852.3471233"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/ROBIO54168.2021.9739520"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00277"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642839"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/2384916.2384934"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/2651380"},{"key":"e_1_3_2_1_80_1","unstructured":"World Wide Web\u00a0Consortium (W3C). 2022. Audio Description or Media Alternative. https:\/\/www.w3.org\/TR\/2008\/REC-WCAG20-20081211\/#media-equiv-audio-desc"},{"key":"e_1_3_2_1_81_1","unstructured":"World Wide Web\u00a0Consortium (W3C). 2022. Providing a movie with extended audio descriptions. https:\/\/www.w3.org\/TR\/WCAG20-TECHS\/G8.html"},{"key":"e_1_3_2_1_82_1","unstructured":"World Wide Web\u00a0Consortium (W3C). 2022. W3C Image Concepts. https:\/\/www.w3.org\/WAI\/tutorials\/images\/"},{"key":"e_1_3_2_1_83_1","volume-title":"International Conference on Mobile and Ubiquitous Systems: Computing, Networking, and Services. Springer, 150\u2013168","author":"Yamanaka Yutaro","year":"2021","unstructured":"Yutaro Yamanaka, Seita Kayukawa, Hironobu Takagi, Yuichi Nagaoka, Yoshimune Hiratsuka, and Satoshi Kurihara. 2021. One-Shot Wayfinding Method for Blind People via OCR and Arrow Analysis with a 360-degree Smartphone Camera. In International Conference on Mobile and Ubiquitous Systems: Computing, Networking, and Services. Springer, 150\u2013168."},{"key":"e_1_3_2_1_84_1","volume-title":"Depth anything: Unleashing the power of large-scale unlabeled data. arXiv preprint arXiv:2401.10891","author":"Yang Lihe","year":"2024","unstructured":"Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, and Hengshuang Zhao. 2024. Depth anything: Unleashing the power of large-scale unlabeled data. arXiv preprint arXiv:2401.10891 (2024)."},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1145\/3334480.3382821"},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_1"},{"key":"e_1_3_2_1_87_1","unstructured":"Yang Zhao Zhijie Lin Daquan Zhou Zilong Huang Jiashi Feng and Bingyi Kang. 2023. BuboGPT: Enabling Visual Grounding in Multi-Modal LLMs. arxiv:2307.08581\u00a0[cs.CV]"},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"publisher","DOI":"10.1145\/2700648.2809865"},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.1145\/2971648.2971730"}],"event":{"name":"UIST '24: The 37th Annual ACM Symposium on User Interface Software and Technology","location":"Pittsburgh PA USA","acronym":"UIST '24"},"container-title":["Proceedings of the 37th Annual ACM Symposium on User Interface Software and Technology"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3654777.3676375","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3654777.3676375","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,4]],"date-time":"2025-08-04T21:11:01Z","timestamp":1754341861000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3654777.3676375"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,11]]},"references-count":89,"alternative-id":["10.1145\/3654777.3676375","10.1145\/3654777"],"URL":"https:\/\/doi.org\/10.1145\/3654777.3676375","relation":{},"subject":[],"published":{"date-parts":[[2024,10,11]]},"assertion":[{"value":"2024-10-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}