{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,13]],"date-time":"2026-06-13T13:40:50Z","timestamp":1781358050363,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":91,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T00:00:00Z","timestamp":1745539200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["R01EY034562"],"award-info":[{"award-number":["R01EY034562"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,26]]},"DOI":"10.1145\/3706598.3714096","type":"proceedings-article","created":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T03:30:09Z","timestamp":1745465409000},"page":"1-29","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["VideoA11y: Method and Dataset for Accessible Video Description"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0788-0189","authenticated-orcid":false,"given":"Chaoyu","family":"Li","sequence":"first","affiliation":[{"name":"School of Computing and Augmented Intelligence, Arizona State University, Tempe, Arizona, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6568-8139","authenticated-orcid":false,"given":"Sid","family":"Padmanabhuni","sequence":"additional","affiliation":[{"name":"School of Computing and Augmented Intelligence, Arizona State University, Tempe, Arizona, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0556-8029","authenticated-orcid":false,"given":"Maryam S","family":"Cheema","sequence":"additional","affiliation":[{"name":"School of Computing and Augmented Intelligence, Arizona State University, Tempe, Arizona, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6437-0463","authenticated-orcid":false,"given":"Hasti","family":"Seifi","sequence":"additional","affiliation":[{"name":"School of Computing and Augmented Intelligence, Arizona State University, Tempe, Arizona, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2625-8216","authenticated-orcid":false,"given":"Pooyan","family":"Fazli","sequence":"additional","affiliation":[{"name":"School of Arts, Media and Engineering, Arizona State University, Tempe, Arizona, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,4,25]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"Nayyer Aafaq Ajmal Mian Wei Liu Syed\u00a0Zulqarnain Gilani and Mubarak Shah. 2019. Video Description: A Survey of Methods Datasets and Evaluation Metrics. ACM Computing Surveys (CSUR) 52 6 (2019) 1\u201337.","DOI":"10.1145\/3355390"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"e_1_3_3_2_4_2","volume-title":"Asian Conference on Computer Vision (ACCV)","author":"Bain Max","year":"2020","unstructured":"Max Bain, Arsha Nagrani, Andrew Brown, and Andrew Zisserman. 2020. Condensed Movies: Story Based Retrieval with Contextual Embeddings. In Asian Conference on Computer Vision (ACCV)."},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581107"},{"key":"e_1_3_3_2_7_2","volume-title":"About the ICT Accessibility 508 Standards and 255 Guidelines","author":"Board US\u00a0Access","year":"2024","unstructured":"US\u00a0Access Board. 2024. About the ICT Accessibility 508 Standards and 255 Guidelines. Retrieved Feb. 18, 2025 from https:\/\/www.access-board.gov\/ict\/"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411763.3451810"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"crossref","unstructured":"Carmen\u00a0J. Branje and Deborah\u00a0I. Fels. 2012. LiveDescribe: Can Amateur Describers Create High-Quality Audio Description? Journal of Visual Impairment & Blindness (JVIB)3 (2012) 154\u2013165.","DOI":"10.1177\/0145482X1210600304"},{"key":"e_1_3_3_2_10_2","first-page":"1877","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems (NeurIPS). 1877\u20131901."},{"key":"e_1_3_3_2_11_2","first-page":"3621","volume-title":"24th Chinese Control and Decision Conference (CCDC)","author":"Cao Changqing","year":"2012","unstructured":"Changqing Cao, Zehua Chen, Gang Xie, and Shaoshuai Lei. 2012. Key Frame Extraction Based on Frame Blocks Differential Accumulation. In 24th Chinese Control and Decision Conference (CCDC). 3621\u20133625."},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545613"},{"key":"e_1_3_3_2_13_2","unstructured":"Maryam Cheema Hasti Seifi and Pooyan Fazli. 2024. Describe Now: User-Driven Audio Description for Blind and Low Vision Individuals. arXiv:2411.11835 [cs.HC]."},{"key":"e_1_3_3_2_14_2","volume-title":"Annual Meeting of the Association for Computational Linguistics (ACL)","author":"Chen David\u00a0L.","year":"2011","unstructured":"David\u00a0L. Chen and William\u00a0B. Dolan. 2011. Collecting Highly Parallel Data for Paraphrase Evaluation. In Annual Meeting of the Association for Computational Linguistics (ACL)."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Haoran Chen Jianmin Li Simone Frintrop and Xiaolin Hu. 2022. The MSR-Video to Text Dataset with Clean Annotations. Computer Vision and Image Understanding (CVIU) (2022) 103581.","DOI":"10.1016\/j.cviu.2022.103581"},{"key":"e_1_3_3_2_16_2","unstructured":"Sihan Chen Xingjian He Longteng Guo Xinxin Zhu Weining Wang Jinhui Tang and Jing Liu. 2023. VALOR: Vision-Audio-Language Omni-Perception Pretraining Model and Dataset. arXiv:2304.08345 [cs.LG]."},{"key":"e_1_3_3_2_17_2","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Chen Sihan","year":"2023","unstructured":"Sihan Chen, Handong Li, Qunbo Wang, Zijia Zhao, Mingzhen Sun, Xinxin Zhu, and Jing Liu. 2023. VAST: A Vision-Audio-Subtitle-Text Omni-Modality Foundation Model and Dataset. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01265"},{"key":"e_1_3_3_2_19_2","unstructured":"Cheng-Yu Chuang and Pooyan Fazli. 2023. CLearViD: Curriculum Learning for Video Description. arXiv:2311.04480 [cs.CV]."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Lalit Dandona and Rakhi Dandona. 2006. Revision of visual impairment definitions in the International Statistical Classification of Diseases. BMC Medicine 4 (2006).","DOI":"10.1186\/1741-7015-4-7"},{"key":"e_1_3_3_2_21_2","volume-title":"Description Key - Quality Description","year":"2024","unstructured":"DCMP. 2024. Description Key - Quality Description. Retrieved Feb. 18, 2025 from https:\/\/dcmp.org\/learn\/621-description-key\u2014quality-description"},{"key":"e_1_3_3_2_22_2","first-page":"4171","volume-title":"Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT). 4171\u20134186."},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i10.29066"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICITBE54178.2021.00032"},{"key":"e_1_3_3_2_25_2","volume-title":"Twenty-First Century Communications and Video Accessibility Act","author":"(FCC) Federal Communications Commission","year":"2024","unstructured":"Federal Communications Commission (FCC). 2024. Twenty-First Century Communications and Video Accessibility Act. Retrieved Feb. 18, 2025 from https:\/\/www.fcc.gov\/cvaa"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1057\/978-1-137-56917-2_8"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00784"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Bin Huang Xin Wang Hong Chen Zihan Song and Wenwu Zhu. 2023. VTimeLLM: Empower LLM to Grasp Video Moments. arXiv:2311.18445 [cs.CV].","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.aacl-main.48"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_41"},{"key":"e_1_3_3_2_31_2","volume-title":"The Twelfth International Conference on Learning Representations (ICLR)","author":"Huang Xinyu","year":"2024","unstructured":"Xinyu Huang, Youcai Zhang, Jinyu Ma, Weiwei Tian, Rui Feng, Yuejie Zhang, Yaqian Li, Yandong Guo, and Lei Zhang. 2024. Tag2Text: Guiding Vision-Language Model via Image Tagging. In The Twelfth International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_32_2","unstructured":"Shasta Ihorn Yue-Ting Siu Aditya Bodi Lothar Narins Jose\u00a0M Castanon Yash Kant Abhishek Das Ilmi Yoon and Pooyan Fazli. 2021. NarrationBot and InfoBot: A Hybrid System for Automated Video Description. arXiv:2111.03994 [cs.CV]."},{"key":"e_1_3_3_2_33_2","volume-title":"ACM SIGCHI Conference on Human Factors in Computing Systems (CHI)","author":"Jiang Lucy","year":"2024","unstructured":"Lucy Jiang, Crescentia Jung, Mahika Phutane, Abigale Stangl, and Shiri Azenkot. 2024. \u201cIt\u2019s Kind of Context Dependent\u201d: Understanding Blind and Low Vision People\u2019s Video Accessibility Preferences Across Viewing Scenarios. In ACM SIGCHI Conference on Human Factors in Computing Systems (CHI)."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"crossref","unstructured":"Jan-Christoph Klie Bonnie Webber and Iryna Gurevych. 2023. Annotation Error Detection: Analyzing the Past and Present for a More Coherent Future. Computational Linguistics (2023) 157\u2013198.","DOI":"10.1162\/coli_a_00464"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/1639642.1639699"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/1878803.1878833"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.3115\/1626355.1626389"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.233"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_27"},{"key":"e_1_3_3_2_41_2","unstructured":"Bo Li Yuanhan Zhang Dong Guo Renrui Zhang Feng Li Hao Zhang Kaichen Zhang Yanwei Li Ziwei Liu and Chunyuan Li. 2024. LLaVA-OneVision: Easy Visual Task Transfer. arXiv:2408.03326 [cs.CV]."},{"key":"e_1_3_3_2_42_2","unstructured":"Chaoyu Li Eun\u00a0Woo Im and Pooyan Fazli. 2024. VidHalluc: Evaluating Temporal Hallucinations in Multimodal Large Language Models for Video Understanding. arXiv:2412.03735 [cs.CV]."},{"key":"e_1_3_3_2_43_2","volume-title":"40th International Conference on Machine Learning (ICML)","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In 40th International Conference on Machine Learning (ICML)."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Bin Lin Yang Ye Bin Zhu Jiaxi Cui Munan Ning Peng Jin and Li Yuan. 2023. Video-LLaVA: Learning United Visual Representation by Alignment Before Projection. arXiv:2311.10122 [cs.CV].","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"e_1_3_3_2_45_2","first-page":"74","volume-title":"Association for Computational Linguistics (ACL)","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Association for Computational Linguistics (ACL). 74\u201381."},{"key":"e_1_3_3_2_46_2","unstructured":"Jingyang Lin Hang Hua Ming Chen Yikang Li Jenhao Hsiao Chiuman Ho and Jiebo Luo. 2023. VideoXum: Cross-modal Visual and Textural Summarization of Videos. IEEE Transactions on Multimedia (TMM) (2023)."},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545703"},{"key":"e_1_3_3_2_49_2","unstructured":"Yi Liu Gelei Deng Yuekang Li Kailong Wang Tianwei Zhang Yepang Liu Haoyu Wang Yan Zheng and Yang Liu. 2023. Prompt Injection attack against LLM-integrated Applications. arXiv:2306.05499 [cs.CV]."},{"key":"e_1_3_3_2_50_2","volume-title":"European Conference on Computer Vision (ECCV)","author":"Liu Yuan","year":"2024","unstructured":"Yuan Liu, Haodong Duan, Yuanhan Zhang, Bo Li, Songyang Zhang, Wangbo Zhao, Yike Yuan, Jiaqi Wang, Conghui He, Ziwei Liu, Kai Chen, and Dahua Lin. 2024. MMBench: Is Your Multi-modal Model an All-around Player?. In European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_3_2_53_2","volume-title":"Media Access Canada (MAC) - Our Projects - Descriptive Video Production and Presentation Best Practices Guide for Digital Environments","author":"Milligan B.","year":"2012","unstructured":"B. Milligan and D. Fels. 2012. Media Access Canada (MAC) - Our Projects - Descriptive Video Production and Presentation Best Practices Guide for Digital Environments. Retrieved Feb. 18, 2025 from http:\/\/www.mediac.ca\/DVBPGDE_V2_28Feb2012.asp"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"crossref","unstructured":"Valerie\u00a0S. Morash Yue-Ting Siu Joshua\u00a0A. Miele Lucia Hasty and Steven Landau. 2015. Guiding Novice Web Workers in Making Image Descriptions Using Templates. ACM Transactions on Accessible Computing (TACCESS) (2015).","DOI":"10.1145\/2764916"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19781-9_24"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491101.3503814"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/3663548.3675617"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/3441852.3471201"},{"key":"e_1_3_3_2_59_2","volume-title":"Audio Description Style Guide v2.5","year":"2024","unstructured":"Netflix. 2024. Audio Description Style Guide v2.5. Retrieved Feb. 18, 2025 from https:\/\/partnerhelp.netflixstudios.com\/hc\/en-us\/articles\/215510667-Audio-Description-Style-Guide-v2-5"},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-naacl.226"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642632"},{"key":"e_1_3_3_2_62_2","volume-title":"Ofcom\u2019s Guidelines on the Provision of Television Access Services","year":"2021","unstructured":"Ofcom. 2021. Ofcom\u2019s Guidelines on the Provision of Television Access Services. Retrieved Feb. 18, 2025 from https:\/\/www.ofcom.org.uk\/__data\/assets\/pdf_file\/0025\/212776\/provision-of-tv-access-services-guidelines.pdf"},{"key":"e_1_3_3_2_63_2","unstructured":"OpenAI. 2024. GPT-4 Technical Report. arXiv:2303.08774 [cs.CL]."},{"key":"e_1_3_3_2_64_2","first-page":"311","volume-title":"Association for Computational Linguistics (ACL)","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. BLEU: A Method for Automatic Evaluation of Machine Translation. In Association for Computational Linguistics (ACL). 311\u2013318."},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415864"},{"key":"e_1_3_3_2_66_2","unstructured":"Baolin Peng Chunyuan Li Pengcheng He Michel Galley and Jianfeng Gao. 2023. Instruction Tuning with GPT-4. arXiv:2304.03277 [cs.CV]."},{"key":"e_1_3_3_2_67_2","volume-title":"Language Models are Unsupervised Multitask Learners","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, et\u00a0al. 2019. Language Models are Unsupervised Multitask Learners. Technical Report\u00a01. OpenAI."},{"key":"e_1_3_3_2_68_2","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Rafailov Rafael","year":"2023","unstructured":"Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher\u00a0D Manning, Stefano Ermon, and Chelsea Finn. 2023. Direct Preference Optimization: Your Language Model is Secretly a Reward Model. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"e_1_3_3_2_70_2","unstructured":"Gunnar\u00a0A. Sigurdsson Abhinav Gupta Cordelia Schmid Ali Farhadi and Karteek Alahari. 2018. Charades-Ego: A Large-Scale Dataset of Paired Third and First Person Videos. arXiv:1804.09626 [cs.CV]."},{"key":"e_1_3_3_2_71_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00497"},{"key":"e_1_3_3_2_72_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642839"},{"key":"e_1_3_3_2_73_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_3_2_74_2","volume-title":"International Conference on Machine Learning (ICML)","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022. OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_3_2_75_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"e_1_3_3_2_76_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Wang Yi","year":"2024","unstructured":"Yi Wang, Yinan He, Yizhuo Li, Kunchang Li, Jiashuo Yu, Xin Ma, Xinhao Li, Guo Chen, Xinyuan Chen, Yaohui Wang, Conghui He, Ping Luo, Ziwei Liu, Yali Wang, Limin Wang, and Yu Qiao. 2024. InternVid: A Large-scale Video-Text Dataset for Multimodal Understanding and Generation. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_77_2","doi-asserted-by":"crossref","unstructured":"Zhanyu Wang Longyue Wang Minghao Wu Zhen Zhao Chenyang Lyu Huayang Li Deng Cai Luping Zhou Shuming Shi and Zhaopeng Tu. 2023. GPT4Video: A Unified Multimodal Large Language Model for lnstruction-Followed Understanding and Safety-Aware Generation. Computing Research Repository (CoRR) (2023).","DOI":"10.1145\/3664647.3681464"},{"key":"e_1_3_3_2_78_2","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, brian ichter, Fei Xia, Ed\u00a0H. Chi, Quoc\u00a0V Le, and Denny Zhou. 2022. Chain of Thought Prompting Elicits Reasoning in Large Language Models. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_79_2","unstructured":"Haiyang Xu Qinghao Ye Xuan Wu Ming Yan Yuan Miao Jiabo Ye Guohai Xu Anwen Hu Yaya Shi Guangwei Xu Chenliang Li Qi Qian Maofei Que Ji Zhang Xiao Zeng and Fei Huang. 2023. Youku-mPLUG: A 10 Million Large-scale Chinese Video-Language Dataset for Pre-training and Benchmarks. arXiv:2306.04362 [cs.CV]."},{"key":"e_1_3_3_2_80_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_3_2_81_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00498"},{"key":"e_1_3_3_2_82_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"e_1_3_3_2_83_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Yang Chengrun","year":"2024","unstructured":"Chengrun Yang, Xuezhi Wang, Yifeng Lu, Hanxiao Liu, Quoc\u00a0V Le, Denny Zhou, and Xinyun Chen. 2024. Large Language Models as Optimizers. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_84_2","volume-title":"YouDescribe","year":"2024","unstructured":"YouDescribe. 2024. YouDescribe. Retrieved Feb. 18, 2025 from https:\/\/www.youdescribe.org\/"},{"key":"e_1_3_3_2_85_2","doi-asserted-by":"crossref","unstructured":"En Yu Liang Zhao Yana Wei Jinrong Yang Dongming Wu Lingyu Kong Haoran Wei Tiancai Wang Zheng Ge Xiangyu Zhang and Wenbing Tao. 2023. Merlin: Empowering Multimodal LLMs with Foresight Minds. arXiv:2312.00589 [cs.CV].","DOI":"10.1007\/978-3-031-73235-5_24"},{"key":"e_1_3_3_2_86_2","doi-asserted-by":"publisher","DOI":"10.1145\/3357236.3395433"},{"key":"e_1_3_3_2_87_2","doi-asserted-by":"publisher","DOI":"10.1145\/3334480.3382821"},{"key":"e_1_3_3_2_88_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01589"},{"key":"e_1_3_3_2_89_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_3_2_90_2","volume-title":"LLaVA-NeXT: A Strong Zero-shot Video Understanding Model","author":"Zhang Yuanhan","year":"2024","unstructured":"Yuanhan Zhang, Bo Li, haotian Liu, Yong\u00a0jae Lee, Liangke Gui, Di Fu, Jiashi Feng, Ziwei Liu, and Chunyuan Li. 2024. LLaVA-NeXT: A Strong Zero-shot Video Understanding Model. Retrieved Feb. 18, 2025 from https:\/\/llava-vl.github.io\/blog\/2024-04-30-llava-next-video\/"},{"key":"e_1_3_3_2_91_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"e_1_3_3_2_92_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12342"}],"event":{"name":"CHI 2025: CHI Conference on Human Factors in Computing Systems","location":"Yokohama Japan","acronym":"CHI '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3714096","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3706598.3714096","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T05:14:29Z","timestamp":1751606069000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3714096"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,25]]},"references-count":91,"alternative-id":["10.1145\/3706598.3714096","10.1145\/3706598"],"URL":"https:\/\/doi.org\/10.1145\/3706598.3714096","relation":{},"subject":[],"published":{"date-parts":[[2025,4,25]]},"assertion":[{"value":"2025-04-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}