{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T19:21:25Z","timestamp":1776108085568,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":22,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,23]]},"DOI":"10.1145\/3707640.3731916","type":"proceedings-article","created":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T12:23:03Z","timestamp":1750422183000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["YouDescribe: Bridging AI Efficiency and Human Insight for Scalable Audio Description"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2760-3632","authenticated-orcid":false,"given":"Lana","family":"Do","sequence":"first","affiliation":[{"name":"Northeastern University, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4391-0016","authenticated-orcid":false,"given":"Sanjay","family":"Mirani","sequence":"additional","affiliation":[{"name":"San Francisco State University, San Francisco, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9013-9341","authenticated-orcid":false,"given":"Charity","family":"Pitcher-Cooper","sequence":"additional","affiliation":[{"name":"Smith-Kettlewell Eye Research Institute, San Francisco, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7605-7922","authenticated-orcid":false,"given":"Xuan Duy Anh","family":"Nguyen","sequence":"additional","affiliation":[{"name":"San Francisco State University, San Francisco, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3874-7699","authenticated-orcid":false,"given":"Alekya","family":"Bairaboina","sequence":"additional","affiliation":[{"name":"San Francisco State University, San Francisco, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2418-5287","authenticated-orcid":false,"given":"Ilmi","family":"Yoon","sequence":"additional","affiliation":[{"name":"Northeastern University, San Jose, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,6,22]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"[n. d.]. The OpenCV Library | Dr Dobb\u2019s. https:\/\/www.drdobbs.com\/open-source\/the-opencv-library\/184404319"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Aditya Bodi Pooyan Fazli Shasta Ihorn Yue\u00a0Ting Siu Andrew\u00a0T. Scott Lothar Narins Yash Kant Abhishek Das and Ilmi Yoon. 2021. Automated Video Description for Blind and Low Vision Users. Conference on Human Factors in Computing Systems - Proceedings (5 2021). https:\/\/doi.org\/10.1145\/3411763.3451810","DOI":"10.1145\/3411763.3451810"},{"key":"e_1_3_3_1_4_2","volume-title":"LiveDescribe: Can Amateur Describers Create High-Quality Audio Description?","author":"Branje Carmen\u00a0J","unstructured":"Carmen\u00a0J Branje and Deborah\u00a0I Fels\u00a0Structured. [n. d.]. LiveDescribe: Can Amateur Describers Create High-Quality Audio Description?Technical Report."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"Ruei\u00a0Che Chang Chao\u00a0Hsien Ting Chia\u00a0Sheng Hung Wan\u00a0Chen Lee Liang\u00a0Jin Chen Yu\u00a0Tzu Chao Bing\u00a0Yu Chen and Anhong Guo. 2022. OmniScribe: Authoring Immersive Audio Descriptions for 360\u00b0 Videos. UIST 2022 - Proceedings of the 35th Annual ACM Symposium on User Interface Software and Technology (10 2022). https:\/\/doi.org\/10.1145\/3526113.3545613","DOI":"10.1145\/3526113.3545613"},{"key":"e_1_3_3_1_6_2","unstructured":"Maryam Cheema Hasti Seifi and Pooyan Fazli. 2024. Describe Now: User-Driven Audio Description for Blind and Low Vision Individuals. (11 2024). http:\/\/arxiv.org\/abs\/2411.11835"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Agnieszka Chmiel and Iwona Mazur. 2022. A homogenous or heterogeneous audience? Audio description preferences of persons with congenital blindness non-congenital blindness and low vision. Perspectives: Studies in Translation Theory and Practice 30 3 (2022) 552\u2013567. https:\/\/doi.org\/10.1080\/0907676X.2021.1913198","DOI":"10.1080\/0907676X.2021.1913198"},{"key":"e_1_3_3_1_8_2","unstructured":"Peng Chu Jiang Wang and Andre Abrantes. 2024. LLM-AD: Large Language Model based Audio Description System. (5 2024). https:\/\/arxiv.org\/abs\/2405.00983v1"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Nazaret Fresno Judit Castell\u00e0 and Olga Soler-Vilageliu. 2016. \u2018What Should I Say?\u2019 Tentative Criteria to Prioritize Information in the Audio Description of Film Characters. Researching Audio Description (2016) 143\u2013167. https:\/\/doi.org\/10.1057\/978-1-137-56917-2_8","DOI":"10.1057\/978-1-137-56917-2_8"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Lucy Jiang and Richard Ladner. 2022. Co-Designing Systems to Support Blind and Low Vision Audio Description Writers. ASSETS 2022 - Proceedings of the 24th International ACM SIGACCESS Conference on Computers and Accessibility (10 2022). https:\/\/doi.org\/10.1145\/3517428.3550394","DOI":"10.1145\/3517428.3550394"},{"key":"e_1_3_3_1_11_2","unstructured":"Junnan Li Dongxu Li S. Savarese and Steven C.\u00a0H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. International Conference on Machine Learning (2023). https:\/\/doi.org\/10.48550\/ARXIV.2301.12597"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Mala\u00a0D. Naraine Deborah\u00a0I. Fels and Margot Whitfield. 2018. Impacts on quality: Enjoyment factors in blind and low vision audience entertainment ratings: A qualitative study. PLoS ONE 13 12 (12 2018). https:\/\/doi.org\/10.1371\/journal.pone.0208165","DOI":"10.1371\/journal.pone.0208165"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Rosiana Natalie Ebrima Jarjue Hernisa Kacorri and Kotaro Hara. 2020. ViScene: A Collaborative Authoring Tool for Scene Descriptions in Videos. ASSETS. ACM Conference on Assistive Technologies 2020 (10 2020) 87. https:\/\/doi.org\/10.1145\/3373625.3418030","DOI":"10.1145\/3373625.3418030"},{"key":"e_1_3_3_1_14_2","volume-title":"An Overview of Video Description: History, Benefits, and Guidelines","author":"Packer Jaclyn","unstructured":"Jaclyn Packer, Katie Vizenor, and Joshua\u00a0A Miele. [n. d.]. An Overview of Video Description: History, Benefits, and Guidelines. Technical Report."},{"key":"e_1_3_3_1_15_2","unstructured":"Charity Pitcher-Cooper Manali Seth Benjamin Kao James\u00a0M Coughlan and Ilmi Yoon. 2023. You Described We Archived: A Rich Audio Description Dataset. Journal on Technology and Persons with Disabilities 11 (2023). https:\/\/youdescribe.org\/"},{"key":"e_1_3_3_1_16_2","unstructured":"Joseph Redmon and Ali Farhadi. 2018. YOLOv3: An Incremental Improvement. (4 2018). https:\/\/arxiv.org\/abs\/1804.02767v1"},{"key":"e_1_3_3_1_17_2","volume-title":"Adding Audio Description: Does It Make a Difference?","author":"Schmeidler Emilie","year":"2000","unstructured":"Emilie Schmeidler, Corinne Kirchner, Katharine Bond, Laurie Everett, Jaclyn Packer, Lawrence Scadden, Joel Snyder, and Karen Wolffe. 2000. Adding Audio Description: Does It Make a Difference?Technical Report."},{"key":"e_1_3_3_1_18_2","unstructured":"TomarSuramya. 2006. Converting video formats with FFmpeg. Linux Journal (6 2006). https:\/\/doi.org\/10.5555\/1134782.1134792"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Tess Van\u00a0Daele Akhil Iyer Yuning Zhang Jalyn\u00a0C. Derry Mina Huh and Amy Pavel. 2024. Making Short-Form Videos Accessible with Hierarchical Video Summaries. Conference on Human Factors in Computing Systems - Proceedings 1 (2 2024) 17. https:\/\/doi.org\/10.1145\/3613904.3642839","DOI":"10.1145\/3613904.3642839"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Yujia Wang and Wei Liang. 2021. Toward automatic audio description generation for accessible videos. Conference on Human Factors in Computing Systems - Proceedings (5 2021). https:\/\/doi.org\/10.1145\/3411764.3445347\/SUPPL_FILE\/3411764.3445347_VIDEOPREVIEW.MP4","DOI":"10.1145\/3411764.3445347"},{"key":"e_1_3_3_1_21_2","first-page":"11415","volume-title":"MMAD: Multi-modal Movie Audio Description","author":"Ye Xiaojun","year":"2024","unstructured":"Xiaojun Ye, Junhao Chen, Xiang Li, Haidong Xin, Chao Li, Sheng Zhou, and Jiajun Bu. 2024. MMAD: Multi-modal Movie Audio Description. Technical Report. 11415 pages. https:\/\/github.com\/Daria8976\/MMAD."},{"key":"e_1_3_3_1_22_2","unstructured":"YouDescribe. [n. d.]. YouDescribe. Accessed Date 2025-03-08. https:\/\/www.youdescribe.org\/."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3357236.3395433"}],"event":{"name":"CHIWORK '25 Adjunct: Adjunct Proceedings of the 4th Annual Symposium on Human-Computer Interaction for Work","location":"Amsterdam Netherlands","acronym":"CHIWORK '25 Adjunct"},"container-title":["Adjunct Proceedings of the 4th Annual Symposium on Human-Computer Interaction for Work"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3707640.3731916","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T12:41:41Z","timestamp":1750423301000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3707640.3731916"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,22]]},"references-count":22,"alternative-id":["10.1145\/3707640.3731916","10.1145\/3707640"],"URL":"https:\/\/doi.org\/10.1145\/3707640.3731916","relation":{},"subject":[],"published":{"date-parts":[[2025,6,22]]},"assertion":[{"value":"2025-06-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}