{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T20:07:55Z","timestamp":1776110875673,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":79,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,11]],"date-time":"2024-05-11T00:00:00Z","timestamp":1715385600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,11]]},"DOI":"10.1145\/3613904.3642632","type":"proceedings-article","created":{"date-parts":[[2024,5,11]],"date-time":"2024-05-11T08:38:25Z","timestamp":1715416705000},"page":"1-18","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":30,"title":["SPICA: Interactive Video Content Exploration through Augmented Audio Descriptions for Blind or Low-Vision Viewers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7374-7453","authenticated-orcid":false,"given":"Zheng","family":"Ning","sequence":"first","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3821-5555","authenticated-orcid":false,"given":"Brianna L","family":"Wimer","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0483-693X","authenticated-orcid":false,"given":"Kaiwen","family":"Jiang","sequence":"additional","affiliation":[{"name":"Jacobs School of Engineering, University of California San Diego, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2558-4823","authenticated-orcid":false,"given":"Keyi","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Mathematics, University of California San Diego, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5620-3513","authenticated-orcid":false,"given":"Jerrick","family":"Ban","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1423-4513","authenticated-orcid":false,"given":"Yapeng","family":"Tian","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Texas at Dallas, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3686-695X","authenticated-orcid":false,"given":"Yuhang","family":"Zhao","sequence":"additional","affiliation":[{"name":"Department of Computer Sciences, University of Wisconsin-Madison, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7902-7625","authenticated-orcid":false,"given":"Toby Jia-Jun","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,5,11]]},"reference":[{"key":"e_1_3_3_3_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3355390"},{"key":"e_1_3_3_3_2_1","volume-title":"Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325","author":"Agostinelli Andrea","year":"2023","unstructured":"Andrea Agostinelli, Timo\u00a0I Denk, Zal\u00e1n Borsos, Jesse Engel, Mauro Verzetti, Antoine Caillon, Qingqing Huang, Aren Jansen, Adam Roberts, Marco Tagliasacchi, 2023. Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325 (2023)."},{"key":"e_1_3_3_3_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581107"},{"key":"e_1_3_3_3_4_1","volume-title":"Livedescribe: can amateur describers create high-quality audio description?Journal of Visual Impairment & Blindness 106, 3","author":"Branje J","year":"2012","unstructured":"Carmen\u00a0J Branje and Deborah\u00a0I Fels. 2012. Livedescribe: can amateur describers create high-quality audio description?Journal of Visual Impairment & Blindness 106, 3 (2012), 154\u2013165."},{"key":"e_1_3_3_3_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3272973.3273006"},{"key":"e_1_3_3_3_6_1","volume-title":"Qualitative research and content validity: developing best practices based on science and experience. Quality of life research 18","author":"Brod Meryl","year":"2009","unstructured":"Meryl Brod, Laura\u00a0E Tesler, and Torsten\u00a0L Christensen. 2009. Qualitative research and content validity: developing best practices based on science and experience. Quality of life research 18 (2009), 1263\u20131278."},{"key":"e_1_3_3_3_7_1","volume-title":"Web content accessibility guidelines (WCAG) 2.0","author":"Caldwell Ben","year":"2008","unstructured":"Ben Caldwell, Michael Cooper, Loretta\u00a0Guarino Reid, Gregg Vanderheiden, Wendy Chisholm, John Slatin, and Jason White. 2008. Web content accessibility guidelines (WCAG) 2.0. WWW Consortium (W3C) 290 (2008), 1\u201334."},{"key":"e_1_3_3_3_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10209-018-0634-4"},{"key":"e_1_3_3_3_9_1","doi-asserted-by":"publisher","DOI":"10.1080\/0907676X.2015.1120760"},{"key":"e_1_3_3_3_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2935334.2935386"},{"key":"e_1_3_3_3_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3597638.3608411"},{"key":"e_1_3_3_3_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3015783.3015792"},{"key":"e_1_3_3_3_13_1","doi-asserted-by":"publisher","DOI":"10.1556\/Acr.15.2014.1.6"},{"key":"e_1_3_3_3_14_1","volume-title":"An introduction to audio description: A practical guide","author":"Fryer Louise","unstructured":"Louise Fryer. 2016. An introduction to audio description: A practical guide. Routledge."},{"key":"e_1_3_3_3_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2557595.2557599"},{"key":"e_1_3_3_3_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2010.5543575"},{"key":"e_1_3_3_3_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10209-008-0141-0"},{"key":"e_1_3_3_3_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373625.3417027"},{"key":"e_1_3_3_3_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_3_3_20_1","volume-title":"A cognitive approach to audio description. Researching audio description: New approaches","author":"Holsanova Jana","year":"2016","unstructured":"Jana Holsanova. 2016. A cognitive approach to audio description. Researching audio description: New approaches (2016), 49\u201373."},{"key":"e_1_3_3_3_21_1","volume-title":"A cognitive approach to audio description: production and reception processes","author":"Holsanova Jana","unstructured":"Jana Holsanova. 2022. A cognitive approach to audio description: production and reception processes. In The Routledge Handbook of Audio Description. Routledge, 57\u201377."},{"key":"e_1_3_3_3_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581494"},{"key":"e_1_3_3_3_23_1","doi-asserted-by":"publisher","DOI":"10.1080\/10447318.2020.1726107"},{"key":"e_1_3_3_3_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01798"},{"key":"e_1_3_3_3_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544549.3585610"},{"key":"e_1_3_3_3_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606830"},{"key":"e_1_3_3_3_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/1414471.1414487"},{"key":"e_1_3_3_3_28_1","volume-title":"Global-local path networks for monocular depth estimation with vertical cutdepth. arXiv preprint arXiv:2201.07436","author":"Kim Doyeon","year":"2022","unstructured":"Doyeon Kim, Woonghyun Ka, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, and Junmo Kim. 2022. Global-local path networks for monocular depth estimation with vertical cutdepth. arXiv preprint arXiv:2201.07436 (2022)."},{"key":"e_1_3_3_3_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/1639642.1639699"},{"key":"e_1_3_3_3_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/1878803.1878833"},{"key":"e_1_3_3_3_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501966"},{"key":"e_1_3_3_3_32_1","volume-title":"International Conference on Machine Learning. PMLR, 12888\u201312900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888\u201312900."},{"key":"e_1_3_3_3_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3025453.3025483"},{"key":"e_1_3_3_3_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415820"},{"key":"e_1_3_3_3_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/VLHCC.2018.8506506"},{"key":"e_1_3_3_3_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_3_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445233"},{"key":"e_1_3_3_3_38_1","volume-title":"Proceedings of the 35th Annual ACM Symposium on User Interface Software and Technology. 1\u201314","author":"Wang Ruolin","year":"2022","unstructured":"Xingyu\"\u00a0Bruce\" Liu, Ruolin Wang, Dingzeyu Li, Xiang\u00a0Anthony Chen, and Amy Pavel. 2022. CrossA11y: Identifying Video Accessibility Issues via Cross-modal Grounding. In Proceedings of the 35th Annual ACM Symposium on User Interface Software and Technology. 1\u201314."},{"key":"e_1_3_3_3_39_1","series-title":"New Series\u2013Themes in Translation Studies 6 (2007)","volume-title":"Designing a course on audio description and defining the main competences of the future professional. Linguistica Antverpiensia","author":"Matamala Anna","unstructured":"Anna Matamala and Pilar Orero. 2007. Designing a course on audio description and defining the main competences of the future professional. Linguistica Antverpiensia, New Series\u2013Themes in Translation Studies 6 (2007)."},{"key":"e_1_3_3_3_40_1","doi-asserted-by":"publisher","DOI":"10.1080\/14781700.2014.943678"},{"key":"e_1_3_3_3_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10209-019-00668-9"},{"key":"e_1_3_3_3_42_1","volume-title":"https:\/\/youdescribe.org","author":"Miele Joshua","year":"2013","unstructured":"Dr.\u00a0Joshua Miele. 2013. YouDescribe. (2013). https:\/\/youdescribe.org"},{"key":"e_1_3_3_3_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173633"},{"key":"e_1_3_3_3_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373625.3418030"},{"key":"e_1_3_3_3_45_1","volume-title":"ECCV 2022 Workshop","author":"Ning Zheng","year":"2022","unstructured":"Zheng Ning, Zheng Zhang, Jerrick Ban, Kaiwen Jiang, Ruohong Gan, Yapeng Tian, and Toby Jia-Jun Li. 2022. MIMOSA: Human-in-the-Loop Generation of Spatial Audio from Videos with Monaural Audio. AV4D: Visual Learning of Sounds in Spaces, ECCV 2022 Workshop (2022)."},{"key":"e_1_3_3_3_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/319382.319398"},{"key":"e_1_3_3_3_47_1","doi-asserted-by":"publisher","DOI":"10.1177\/0145482X1510900204"},{"key":"e_1_3_3_3_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415864"},{"key":"e_1_3_3_3_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3441852.3471234"},{"key":"e_1_3_3_3_50_1","volume-title":"A comparative study of audio description guidelines prevalent in different countries","author":"Rai Sonali","year":"2010","unstructured":"Sonali Rai, Joan Greening, and Leen Petr\u00e9. 2010. A comparative study of audio description guidelines prevalent in different countries. Londra: Royal National Institute of Blind People (2010)."},{"key":"e_1_3_3_3_51_1","volume-title":"Sentence-bert: Sentence embeddings using siamese bert-networks. arXiv preprint arXiv:1908.10084","author":"Reimers Nils","year":"2019","unstructured":"Nils Reimers and Iryna Gurevych. 2019. Sentence-bert: Sentence embeddings using siamese bert-networks. arXiv preprint arXiv:1908.10084 (2019)."},{"key":"e_1_3_3_3_52_1","unstructured":"Aline Remael Nina Reviers and Gert Vercauteren. 2015. Pictures painted in words: ADLAB audio description guidelines. (2015). https:\/\/dcmp.org\/learn\/captioningkey\/624"},{"key":"e_1_3_3_3_53_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-020-68253-2"},{"key":"e_1_3_3_3_54_1","first-page":"657","article-title":"Color and contrast in E-Learning design: A review of the literature and recommendations for instructional designers and web developers","volume":"10","author":"Richardson T","year":"2014","unstructured":"Rick\u00a0T Richardson, Tara\u00a0L Drexler, and Donna\u00a0M Delparte. 2014. Color and contrast in E-Learning design: A review of the literature and recommendations for instructional designers and web developers. MERLOT Journal of Online Learning and Teaching 10, 4 (2014), 657\u2013670.","journal-title":"MERLOT Journal of Online Learning and Teaching"},{"key":"e_1_3_3_3_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/hcomp.v5i1.13301"},{"key":"e_1_3_3_3_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/1969289.1969299"},{"key":"e_1_3_3_3_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415848"},{"key":"e_1_3_3_3_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CSCI.2014.116"},{"key":"e_1_3_3_3_59_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ics.2005.05.215"},{"key":"e_1_3_3_3_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376404"},{"key":"e_1_3_3_3_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300833"},{"key":"e_1_3_3_3_62_1","unstructured":"Terril Thompson. 2017. My Audio Description Talk@ CSUN."},{"key":"e_1_3_3_3_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00277"},{"key":"e_1_3_3_3_64_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_26"},{"key":"e_1_3_3_3_65_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"e_1_3_3_3_66_1","volume-title":"Media for all","author":"Vercauteren Gert","unstructured":"Gert Vercauteren. 2007. Towards a European guideline for audio description. In Media for all. Brill, 139\u2013149."},{"key":"e_1_3_3_3_67_1","volume-title":"International Conference on Machine Learning. PMLR, 23318\u201323340","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In International Conference on Machine Learning. PMLR, 23318\u201323340."},{"key":"e_1_3_3_3_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445347"},{"key":"e_1_3_3_3_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445347"},{"key":"e_1_3_3_3_70_1","volume-title":"AAAI\u201998 Workshop: Representations for Multi-modal Human-Computer Interaction. 6\u2013p.","author":"Wolff Fr\u00e9d\u00e9ric","year":"1998","unstructured":"Fr\u00e9d\u00e9ric Wolff, Antonella De\u00a0Angeli, and Laurent Romary. 1998. Acting on a visual world: The role of perception in multimodal HCI. In AAAI\u201998 Workshop: Representations for Multi-modal Human-Computer Interaction. 6\u2013p."},{"key":"e_1_3_3_3_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01363"},{"key":"e_1_3_3_3_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/2998181.2998364"},{"key":"e_1_3_3_3_73_1","volume-title":"Color improves object recognition in normal and low vision.Journal of Experimental Psychology: Human perception and performance 19, 4","author":"Wurm H","year":"1993","unstructured":"Lee\u00a0H Wurm, Gordon\u00a0E Legge, Lisa\u00a0M Isenberg, and Andrew Luebker. 1993. Color improves object recognition in normal and low vision.Journal of Experimental Psychology: Human perception and performance 19, 4 (1993), 899."},{"key":"e_1_3_3_3_74_1","volume-title":"HeritageSite AR: An Exploration Game for Quality Education and Sustainable Cultural Heritage. Extended Abstracts of the 2023 CHI Conference on Human Factors in Computing Systems","author":"Xu Ningning","year":"2023","unstructured":"Ningning Xu, Jia\u00a0Wen Liang, Kexiang Shuai, Yuwen Li, and Jiaqi Yan. 2023. HeritageSite AR: An Exploration Game for Quality Education and Sustainable Cultural Heritage. Extended Abstracts of the 2023 CHI Conference on Human Factors in Computing Systems (2023). https:\/\/api.semanticscholar.org\/CorpusID:258217164"},{"key":"e_1_3_3_3_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357236.3395433"},{"key":"e_1_3_3_3_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00721"},{"key":"e_1_3_3_3_77_1","volume-title":"PEANUT: A Human-AI Collaborative Tool for Annotating Audio-Visual Data. arXiv preprint arXiv:2307.15167","author":"Zhang Zheng","year":"2023","unstructured":"Zheng Zhang, Zheng Ning, Chenliang Xu, Yapeng Tian, and Toby Jia-Jun Li. 2023. PEANUT: A Human-AI Collaborative Tool for Annotating Audio-Visual Data. arXiv preprint arXiv:2307.15167 (2023)."},{"key":"e_1_3_3_3_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/2971648.2971730"},{"key":"e_1_3_3_3_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/2702123.2702437"}],"event":{"name":"CHI '24: CHI Conference on Human Factors in Computing Systems","location":"Honolulu HI USA","acronym":"CHI '24","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGACCESS ACM Special Interest Group on Accessible Computing"]},"container-title":["Proceedings of the CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613904.3642632","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3613904.3642632","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:56:56Z","timestamp":1750291016000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613904.3642632"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,11]]},"references-count":79,"alternative-id":["10.1145\/3613904.3642632","10.1145\/3613904"],"URL":"https:\/\/doi.org\/10.1145\/3613904.3642632","relation":{},"subject":[],"published":{"date-parts":[[2024,5,11]]},"assertion":[{"value":"2024-05-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}