{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:57:36Z","timestamp":1776931056182,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,23]]},"DOI":"10.1145\/3742413.3789072","type":"proceedings-article","created":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T11:32:24Z","timestamp":1772537544000},"page":"1514-1525","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Visual Lyrics: Generating Animated Text for Music Lyric Videos with an Augmented Text Editor"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0116-0463","authenticated-orcid":false,"given":"David Chuan-En","family":"Lin","sequence":"first","affiliation":[{"name":"Human-Computer Interaction Institute, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9234-9960","authenticated-orcid":false,"given":"Cuong","family":"Nguyen","sequence":"additional","affiliation":[{"name":"Adobe Research, San Francisco, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8798-4580","authenticated-orcid":false,"given":"Hijung Valentina","family":"Shin","sequence":"additional","affiliation":[{"name":"Adobe Research, Cambridge, Massachusetts, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1824-0243","authenticated-orcid":false,"given":"Nikolas","family":"Martelaro","sequence":"additional","affiliation":[{"name":"Human-Computer Interaction Institute, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,3,22]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2025. Adobe After Effects. https:\/\/www.adobe.com\/products\/aftereffects.html"},{"key":"e_1_3_3_2_3_2","unstructured":"2025. The best place to build test and discover front-end code. https:\/\/codepen.io\/"},{"key":"e_1_3_3_2_4_2","unstructured":"2025. Bring your designs to life with Magic Animate. https:\/\/www.canva.com\/pro\/animator\/"},{"key":"e_1_3_3_2_5_2","unstructured":"2025. Design Made Easy - Adobe Express. https:\/\/www.adobe.com\/express\/"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Maneesh Agrawala Wilmot Li and Floraine Berthouzoz. 2011. Design principles for visual communication. Commun. ACM 54 4 (2011) 60\u201369.","DOI":"10.1145\/1924421.1924439"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Max Bain Jaesung Huh Tengda Han and Andrew Zisserman. 2023. Whisperx: Time-accurate speech transcription of long-form audio. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.00747 (2023).","DOI":"10.21437\/Interspeech.2023-78"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2007.366341"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"crossref","unstructured":"Erin Cherry and Celine Latulipe. 2014. Quantifying the creativity support of digital tools through the creativity support index. ACM Transactions on Computer-Human Interaction (TOCHI) 21 4 (2014) 1\u201325.","DOI":"10.1145\/2617588"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/1357054.1357122"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Alain De\u00a0Cheveign\u00e9 and Hideki Kawahara. 2002. YIN a fundamental frequency estimator for speech and music. The Journal of the Acoustical Society of America 111 4 (2002) 1917\u20131930.","DOI":"10.1121\/1.1458024"},{"key":"e_1_3_3_2_12_2","unstructured":"Fitton Music. 2025. Vibrato. https:\/\/www.fittonmusic.com\/writing\/noise\/filtering\/vibrato.html."},{"key":"e_1_3_3_2_13_2","first-page":"377","volume-title":"Proceedings of the SIGCHI conference on Human factors in computing systems","author":"Forlizzi Jodi","year":"2003","unstructured":"Jodi Forlizzi, Johnny Lee, and Scott Hudson. 2003. The kinedit system: affective messages using dynamic texts. In Proceedings of the SIGCHI conference on Human factors in computing systems. 377\u2013384."},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Hiromasa Fujihara Masataka Goto Jun Ogata and Hiroshi\u00a0G Okuno. 2011. LyricSynchronizer: Automatic synchronization system between musical audio signals and lyrics. IEEE Journal of Selected Topics in Signal Processing 5 6 (2011) 1252\u20131261.","DOI":"10.1109\/JSTSP.2011.2159577"},{"key":"e_1_3_3_2_15_2","first-page":"311","volume-title":"ISMIR","author":"Goto Masataka","year":"2011","unstructured":"Masataka Goto, Kazuyoshi Yoshii, Hiromasa Fujihara, Matthias Mauch, and Tomoyasu Nakano. 2011. Songle: A Web Service for Active Music Listening Improved by User Contributions.. In ISMIR. Citeseer, 311\u2013316."},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Romain Hennequin Anis Khlif Felix Voituret and Manuel Moussallam. 2020. Spleeter: a fast and efficient music source separation tool with pre-trained models. Journal of Open Source Software 5 50 (2020) 2154.","DOI":"10.21105\/joss.02154"},{"key":"e_1_3_3_2_17_2","unstructured":"Edward\u00a0J Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.09685 (2021)."},{"key":"e_1_3_3_2_18_2","unstructured":"Juyong Jiang Fan Wang Jiasi Shen Sungju Kim and Sunghun Kim. 2024. A survey on large language models for code generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.00515 (2024)."},{"key":"e_1_3_3_2_19_2","unstructured":"Duke\u00a0& Jones and Louis Theroux. 2022. Jiggle Jiggle. https:\/\/genius.com\/Duke-and-jones-and-louis-theroux-jiggle-jiggle-lyrics."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580931"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/2702123.2702140"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/2556288.2556987"},{"key":"e_1_3_3_2_23_2","unstructured":"Black\u00a0Forest Labs. 2024. FLUX. https:\/\/github.com\/black-forest-labs\/flux."},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/571985.571997"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/320719.322594"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"James\u00a0R Lewis. 2018. The system usability scale: past present and future. International Journal of Human\u2013Computer Interaction 34 7 (2018) 577\u2013590.","DOI":"10.1080\/10447318.2018.1455307"},{"key":"e_1_3_3_2_27_2","unstructured":"Patrick Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel et\u00a0al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in neural information processing systems 33 (2020) 9459\u20139474."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Jiawei Lin Jiaqi Guo Shizhao Sun Zijiang Yang Jian-Guang Lou and Dongmei Zhang. 2023. Layoutprompter: awaken the design ability of large language models. Advances in Neural Information Processing Systems 36 (2023) 43852\u201343879.","DOI":"10.52202\/075280-1902"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_3_2_30_2","unstructured":"Vivian Liu Rubaiat\u00a0Habib Kazi Li-Yi Wei Matthew Fisher Timothy Langlois Seth Walker and Lydia Chilton. 2024. LogoMotion: Visually Grounded Code Generation for Content-Aware Animation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.07065 (2024)."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606757"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Brian McFee Colin Raffel Dawen Liang Daniel\u00a0PW Ellis Matt McVicar Eric Battenberg and Oriol Nieto. 2015. librosa: Audio and music signal analysis in python.SciPy 2015 (2015) 18\u201324.","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"e_1_3_3_2_33_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"crossref","unstructured":"Casey Reas and Ben Fry. 2006. Processing: programming for the media arts. Ai & Society 20 (2006) 526\u2013538.","DOI":"10.1007\/s00146-006-0050-9"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/1101149.1101278"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/SMC.2016.7844629"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2007.4376991"},{"key":"e_1_3_3_2_38_2","unstructured":"Soundation. 2025. Butterworth Filter - The low pass filter. https:\/\/soundation.com\/audio-effects\/butterworth-filter."},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00041"},{"key":"e_1_3_3_2_40_2","unstructured":"Tiffany Tseng Ruijia Cheng and Jeffrey Nichols. 2024. Keyframer: Empowering animation design using large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.06071 (2024)."},{"key":"e_1_3_3_2_41_2","unstructured":"Liwenhan Xie Xinhuan Shu Jeon\u00a0Cheol Su Yun Wang Siming Chen and Huamin Qu. 2023. Creating emordle: Animating word cloud for emotion expression. IEEE Transactions on Visualization and Computer Graphics (2023)."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606813"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Jingfeng Yao Xinggang Wang Shusheng Yang and Baoyuan Wang. 2024. Vitmatte: Boosting image matting with pre-trained plain vision transformers. Information Fusion 103 (2024) 102091.","DOI":"10.1016\/j.inffus.2023.102091"}],"event":{"name":"IUI '26: 31st International Conference on Intelligent User Interfaces","location":"Paphos Cyprus","acronym":"IUI '26","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGAI ACM Special Interest Group on Artificial Intelligence"]},"container-title":["Proceedings of the 31st International Conference on Intelligent User Interfaces"],"original-title":[],"deposited":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T13:00:24Z","timestamp":1773493224000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3742413.3789072"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,22]]},"references-count":42,"alternative-id":["10.1145\/3742413.3789072","10.1145\/3742413"],"URL":"https:\/\/doi.org\/10.1145\/3742413.3789072","relation":{},"subject":[],"published":{"date-parts":[[2026,3,22]]},"assertion":[{"value":"2026-03-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}