{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T20:37:42Z","timestamp":1778877462270,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","funder":[{"name":"Guangdong Basic and Applied Basic Research Foundation","award":["2023A1515011639"],"award-info":[{"award-number":["2023A1515011639"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754584","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:47:18Z","timestamp":1761374838000},"page":"9168-9176","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["VisAug: Facilitating Speech-Rich Web Video Navigation and Engagement with Auto-Generated Visual Augmentations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0574-1663","authenticated-orcid":false,"given":"Baoquan","family":"Zhao","sequence":"first","affiliation":[{"name":"Sun Yat-sen University, Zhuhai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1555-7279","authenticated-orcid":false,"given":"Xiaofan","family":"Ma","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7702-7897","authenticated-orcid":false,"given":"Qianshi","family":"Pang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2712-4412","authenticated-orcid":false,"given":"Ruomei","family":"Wang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0400-9366","authenticated-orcid":false,"given":"Fan","family":"Zhou","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9871-4014","authenticated-orcid":false,"given":"Shujin","family":"Lin","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00290"},{"key":"e_1_3_2_1_2_1","volume-title":"Whisperx: Time-accurate speech transcription of long-form audio. arXiv preprint arXiv:2303.00747","author":"Bain Max","year":"2023","unstructured":"Max Bain, Jaesung Huh, Tengda Han, and Andrew Zisserman. 2023. Whisperx: Time-accurate speech transcription of long-form audio. arXiv preprint arXiv:2303.00747 (2023)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612524"},{"key":"e_1_3_2_1_4_1","volume-title":"Re-Imagen: Retrieval-Augmented Text-to-Image Generator. In The Eleventh International Conference on Learning Representations.","author":"Chen Wenhu","year":"2022","unstructured":"Wenhu Chen, Hexiang Hu, Chitwan Saharia, and William W Cohen. 2022a. Re-Imagen: Retrieval-Augmented Text-to-Image Generator. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_5_1","volume-title":"Sporthesia: Augmenting sports videos using natural language","author":"Chen Zhutian","year":"2022","unstructured":"Zhutian Chen, Qisen Yang, Xiao Xie, Johanna Beyer, Haijun Xia, Yingcai Wu, and Hanspeter Pfister. 2022b. Sporthesia: Augmenting sports videos using natural language. IEEE transactions on visualization and computer graphics, Vol. 29, 1 (2022), 918-928."},{"key":"e_1_3_2_1_6_1","first-page":"16890","article-title":"Cogview2: Faster and better text-to-image generation via hierarchical transformers","volume":"35","author":"Ding Ming","year":"2022","unstructured":"Ming Ding, Wendi Zheng, Wenyi Hong, and Jie Tang. 2022. Cogview2: Faster and better text-to-image generation via hierarchical transformers. Advances in Neural Information Processing Systems, Vol. 35 (2022), 16890-16902.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3568444.3568455"},{"key":"e_1_3_2_1_8_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00694"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376842"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3174106"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413561"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.33011\/lilt.v12i.1377"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-019-08571-4"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376519"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"e_1_3_2_1_17_1","first-page":"2800","article-title":"Automatic Expansion of the MRC Psycholinguistic Database Imageability Ratings","author":"Liu Ting","year":"2014","unstructured":"Ting Liu, Kit Cho, George Aaron Broadwell, Samira Shaikh, Tomek Strzalkowski, John Lien, Sarah M Taylor, Laurie Feldman, Boris Yamrom, Nick Webb, et al., 2014. Automatic Expansion of the MRC Psycholinguistic Database Imageability Ratings.. In LREC. 2800-2805.","journal-title":"LREC."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581566"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-3028"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581091"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372278.3390731"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2858036.2858456"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1080\/17489539.2021.1934982"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612012"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/s12650-023-00909-3"},{"key":"e_1_3_2_1_26_1","volume-title":"Data player: Automatic generation of data videos with narration-animation interplay","author":"Shen Leixian","year":"2023","unstructured":"Leixian Shen, Yizhi Zhang, Haidong Zhang, and Yun Wang. 2023. Data player: Automatic generation of data videos with narration-animation interplay. IEEE Transactions on Visualization and Computer Graphics (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.60"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.147"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612219"},{"key":"e_1_3_2_1_30_1","volume-title":"MRC psycholinguistic database: Machine-usable dictionary, version 2.00. Behavior research methods, instruments, & computers","author":"Wilson Michael","year":"1988","unstructured":"Michael Wilson. 1988. MRC psycholinguistic database: Machine-usable dictionary, version 2.00. Behavior research methods, instruments, & computers, Vol. 20, 1 (1988), 6-10."},{"key":"e_1_3_2_1_31_1","volume-title":"A theoretically motivated method for automatically evaluating texts for gist inferences. Behavior research methods","author":"Wolfe Christopher R","year":"2019","unstructured":"Christopher R Wolfe, Mitchell Dandignac, and Valerie F Reyna. 2019. A theoretically motivated method for automatically evaluating texts for gist inferences. Behavior research methods, Vol. 51 (2019), 2419-2437."},{"key":"e_1_3_2_1_32_1","volume-title":"Composition and Deformance: Measuring Imageability with a Text-to-Image Model. arXiv preprint arXiv:2306.03168","author":"Wu Si","year":"2023","unstructured":"Si Wu and David A Smith. 2023. Composition and Deformance: Measuring Imageability with a Text-to-Image Model. arXiv preprint arXiv:2306.03168 (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415882"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TLT.2022.3216535"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.165"},{"key":"e_1_3_2_1_36_1","volume-title":"Differentiable Prompt Makes Pre-trained Language Models Better Few-shot Learners. In International Conference on Learning Representations.","author":"Zhang Ningyu","year":"2021","unstructured":"Ningyu Zhang, Luoqiu Li, Xiang Chen, Shumin Deng, Zhen Bi, Chuanqi Tan, Fei Huang, and Huajun Chen. 2021. Differentiable Prompt Makes Pre-trained Language Models Better Few-shot Learners. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123406"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1093\/jamia\/ocv123"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00164"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754584","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:13:54Z","timestamp":1765340034000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754584"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":40,"alternative-id":["10.1145\/3746027.3754584","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754584","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}