{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T17:06:58Z","timestamp":1779383218726,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":78,"publisher":"ACM","funder":[{"name":"IITP","award":["RS-2025-02263169; RS-2024-00457882"],"award-info":[{"award-number":["RS-2025-02263169; RS-2024-00457882"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754577","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:47:18Z","timestamp":1761374838000},"page":"10758-10767","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["AlignDiT: Multimodal Aligned Diffusion Transformer for Synchronized Speech Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-6817-604X","authenticated-orcid":false,"given":"Jeongsoo","family":"Choi","sequence":"first","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3433-918X","authenticated-orcid":false,"given":"Ji-Hoon","family":"Kim","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3542-9934","authenticated-orcid":false,"given":"Kim","family":"Sung-Bin","sequence":"additional","affiliation":[{"name":"Pohang University of Science and Technology, Pohang, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0468-1571","authenticated-orcid":false,"given":"Tae-Hyun","family":"Oh","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7741-7275","authenticated-orcid":false,"given":"Joon Son","family":"Chung","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Andrew Senior, Oriol Vinyals, and Andrew Zisserman.","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, Andrew Senior, Oriol Vinyals, and Andrew Zisserman. 2018b. Deep audio-visual speech recognition. IEEE Trans. on Pattern Analysis and Machine Intelligence (2018)."},{"key":"e_1_3_2_1_2_1","volume-title":"Joon Son Chung, and Andrew Zisserman","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, and Andrew Zisserman. 2018a. LRS3-TED: a large-scale dataset for visual speech recognition. arXiv preprint arXiv:1809.00496 (2018)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-646"},{"key":"e_1_3_2_1_4_1","volume-title":"Proc. NeurIPS.","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. In Proc. NeurIPS."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.116"},{"key":"e_1_3_2_1_6_1","volume-title":"Vall-e 2: Neural codec language models are human parity zero-shot text to speech synthesizers. arXiv preprint arXiv:2406.05370","author":"Chen Sanyuan","year":"2024","unstructured":"Sanyuan Chen, Shujie Liu, Long Zhou, Yanqing Liu, Xu Tan, Jinyu Li, Sheng Zhao, Yao Qian, and Furu Wei. 2024. Vall-e 2: Neural codec language models are human parity zero-shot text to speech synthesizers. arXiv preprint arXiv:2406.05370 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"Wavlm: Large-scale self-supervised pre-training for full stack speech processing","author":"Chen Sanyuan","year":"2022","unstructured":"Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, et al., 2022. Wavlm: Large-scale self-supervised pre-training for full stack speech processing. IEEE Journal of Selected Topics in Signal Processing (2022)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.313"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00718"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889780"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-194"},{"key":"e_1_3_2_1_12_1","volume-title":"Proc. Interspeech.","author":"Choi Jeongsoo","year":"2025","unstructured":"Jeongsoo Choi, Zhikang Niu, Ji-Hoon Kim, Chunhui Wang, Joon Son Chung, and Xie Chen. 2025b. Accelerating Diffusion-based Text-to-Speech Model Training with Dual Modality Alignment. In Proc. Interspeech."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02580"},{"key":"e_1_3_2_1_14_1","volume-title":"ACCV workshop.","author":"Chung Joon Son","year":"2016","unstructured":"Joon Son Chung and Andrew Zisserman. 2016. Out of time: automated lip sync in the wild. In ACCV workshop."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01411"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Gaoxiang Cong Yuankai Qi Liang Li Amin Beheshti Zhedong Zhang Anton van den Hengel Ming-Hsuan Yang Chenggang Yan and Qingming Huang. 2024. Styledubber: towards multi-scale style learning for movie dubbing. In Findings of ACL.","DOI":"10.18653\/v1\/2024.findings-acl.404"},{"key":"e_1_3_2_1_17_1","volume-title":"High Fidelity Neural Audio Compression. Trans. on Machine Learning Research","author":"D\u00e9fossez Alexandre","year":"2023","unstructured":"Alexandre D\u00e9fossez, Jade Copet, Gabriel Synnaeve, and Yossi Adi. 2023. High Fidelity Neural Audio Compression. Trans. on Machine Learning Research (2023)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00525"},{"key":"e_1_3_2_1_19_1","volume-title":"The multimodal facilitation effect in human communication. Psychonomic Bulletin & Review","author":"Drijvers Linda","year":"2023","unstructured":"Linda Drijvers and Judith Holler. 2023. The multimodal facilitation effect in human communication. Psychonomic Bulletin & Review (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953127"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832320"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01033"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19966"},{"key":"e_1_3_2_1_26_1","volume-title":"Proc. NeurIPS","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Proc. NeurIPS (2020)."},{"key":"e_1_3_2_1_27_1","volume-title":"Classifier-Free Diffusion Guidance. In NeurIPS workshop.","author":"Ho Jonathan","year":"2021","unstructured":"Jonathan Ho and Tim Salimans. 2021. Classifier-Free Diffusion Guidance. In NeurIPS workshop."},{"key":"e_1_3_2_1_28_1","volume-title":"Multimodal language processing in human communication. Trends in Cognitive Sciences","author":"Holler Judith","year":"2019","unstructured":"Judith Holler and Stephen C Levinson. 2019. Multimodal language processing in human communication. Trends in Cognitive Sciences (2019)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01801"},{"key":"e_1_3_2_1_30_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. on Audio, Speech, and Language Processing (2021)."},{"key":"e_1_3_2_1_31_1","volume-title":"Proc. NeurIPS.","author":"Hu Chenxu","year":"2021","unstructured":"Chenxu Hu, Qiao Tian, Tingle Li, Wang Yuping, Yuxuan Wang, and Hang Zhao. 2021. Neural dubber: Dubbing for videos according to scripts. In Proc. NeurIPS."},{"key":"e_1_3_2_1_32_1","volume-title":"read and prompt: High-fidelity text-to-speech with minimal supervision. Trans. of the Association for Computational Linguistics","author":"Kharitonov Eugene","year":"2023","unstructured":"Eugene Kharitonov, Damien Vincent, Zal\u00e1n Borsos, Rapha\u00ebl Marinier, Sertan Girgin, Olivier Pietquin, Matt Sharifi, Marco Tagliasacchi, and Neil Zeghidour. 2023. Speak, read and prompt: High-fidelity text-to-speech with minimal supervision. Trans. of the Association for Computational Linguistics (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01480"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.28055"},{"key":"e_1_3_2_1_35_1","volume-title":"Proc. NeurIPS.","author":"Kim Minsu","year":"2021","unstructured":"Minsu Kim, Joanna Hong, and Yong Man Ro. 2021. Lip to speech synthesis with visual context attentional GAN. In Proc. NeurIPS."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i7.25998"},{"key":"e_1_3_2_1_37_1","volume-title":"Proc. NeurIPS.","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. In Proc. NeurIPS."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33012588"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-60276-5_27"},{"key":"e_1_3_2_1_40_1","volume-title":"Proc. NeurIPS.","author":"Le Matthew","year":"2023","unstructured":"Matthew Le, Apoorv Vyas, Bowen Shi, Brian Karrer, Leda Sari, Rashel Moritz, Mary Williamson, Vimal Manohar, Yossi Adi, Jay Mahadeokar, et al., 2023. Voicebox: Text-guided multilingual universal speech generation at scale. In Proc. NeurIPS."},{"key":"e_1_3_2_1_41_1","volume-title":"Proc. ICLR.","author":"Lee Keon","year":"2025","unstructured":"Keon Lee, Dong Won Kim, Jaehyeon Kim, and Jaewoong Cho. 2025. Ditto-tts: Efficient and scalable zero-shot text-to-speech with diffusion transformer. In Proc. ICLR."},{"key":"e_1_3_2_1_42_1","volume-title":"Proc. ICLR.","author":"Lipman Yaron","year":"2023","unstructured":"Yaron Lipman, Ricky TQ Chen, Heli Ben-Hamu, Maximilian Nickel, and Matt Le. 2023. Flow matching for generative modeling. In Proc. ICLR."},{"key":"e_1_3_2_1_43_1","volume-title":"Proc. ICLR.","author":"Liu Alexander H","year":"2024","unstructured":"Alexander H Liu, Matt Le, Apoorv Vyas, Bowen Shi, Andros Tjandra, and Wei-Ning Hsu. 2024. Generative pre-training for speech with flow matching. In Proc. ICLR."},{"key":"e_1_3_2_1_44_1","volume-title":"Proc. ICLR.","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In Proc. ICLR."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746421"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096889"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00290"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095710"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"e_1_3_2_1_50_1","volume-title":"Hearing lips and seeing voices. Nature","author":"McGurk Harry","year":"1976","unstructured":"Harry McGurk and John MacDonald. 1976. Hearing lips and seeing voices. Nature (1976)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448291"},{"key":"e_1_3_2_1_52_1","volume-title":"Proc. Interspeech.","author":"Mira Rodrigo","year":"2022","unstructured":"Rodrigo Mira, Alexandros Haliassos, Stavros Petridis, Bj\u00f6rn W Schuller, and Maja Pantic. 2022a. SVTS: Scalable Video-to-Speech Synthesis. In Proc. Interspeech."},{"key":"e_1_3_2_1_53_1","volume-title":"End-to-end video-to-speech synthesis using generative adversarial networks","author":"Mira Rodrigo","year":"2022","unstructured":"Rodrigo Mira, Konstantinos Vougioukas, Pingchuan Ma, Stavros Petridis, Bj\u00f6rn W Schuller, and Maja Pantic. 2022b. End-to-end video-to-speech synthesis using generative adversarial networks. IEEE Trans. on Cybernetics (2022)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.34.10"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20102"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639643"},{"key":"e_1_3_2_1_59_1","volume-title":"Proc. BMVC.","author":"Prajwal KR","year":"2021","unstructured":"KR Prajwal, Liliane Momeni, Triantafyllos Afouras, and Andrew Zisserman. 2021. Visual keyword spotting with attention. In Proc. BMVC."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01381"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_1_62_1","volume-title":"Proc. ICML.","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In Proc. ICML."},{"key":"e_1_3_2_1_63_1","volume-title":"Proc. ICLR","author":"Ren Yi","year":"2021","unstructured":"Yi Ren, Chenxu Hu, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2021. Fastspeech 2: Fast and high-quality end-to-end text to speech. Proc. ICLR (2021)."},{"key":"e_1_3_2_1_64_1","volume-title":"Proc. NeurIPS.","author":"Ren Yi","year":"2019","unstructured":"Yi Ren, Yangjun Ruan, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2019. Fastspeech: Fast, robust and controllable text to speech. In Proc. NeurIPS."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"e_1_3_2_1_66_1","volume-title":"Proc. ICLR.","author":"Shi Bowen","year":"2022","unstructured":"Bowen Shi, Wei-Ning Hsu, Kushal Lakhotia, and Abdelrahman Mohamed. 2022. Learning audio-visual speech representation by masked multimodal cluster prediction. In Proc. ICLR."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.367"},{"key":"e_1_3_2_1_68_1","volume-title":"Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456","author":"Song Yang","year":"2020","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2020. Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456 (2020)."},{"key":"e_1_3_2_1_69_1","volume-title":"Proc. NeurIPS.","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Proc. NeurIPS."},{"key":"e_1_3_2_1_70_1","unstructured":"Chengyi Wang Sanyuan Chen Yu Wu Ziqiang Zhang Long Zhou Shujie Liu Zhuo Chen Yanqing Liu Huaming Wang Jinyu Li et al. 2023. Neural codec language models are zero-shot text to speech synthesizers. arXiv preprint arXiv:2301.02111 (2023)."},{"key":"e_1_3_2_1_71_1","volume-title":"So Kweon, and Saining Xie. 2023. Convnext v2: Co-designing and scaling convnets with masked autoencoders. In Proc. CVPR.","author":"Woo Sanghyun","unstructured":"Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, and Saining Xie. 2023. Convnext v2: Co-designing and scaling convnets with masked autoencoders. In Proc. CVPR."},{"key":"e_1_3_2_1_72_1","volume-title":"Proc. NeurIPS.","author":"Xu Sicheng","year":"2024","unstructured":"Sicheng Xu, Guojun Chen, Yu-Xiao Guo, Jiaolong Yang, Chong Li, Zhenyu Zang, Yizhong Zhang, Xin Tong, and Baining Guo. 2024. Vasa-1: Lifelike audio-driven talking faces generated in real time. In Proc. NeurIPS."},{"key":"e_1_3_2_1_73_1","volume-title":"Audio-Visual Speech Representation Expert for Enhanced Talking Face Video Generation and Evaluation. In CVPR workshop.","author":"Yaman Dogucan","year":"2024","unstructured":"Dogucan Yaman, Fevziye Irem Eyiokur, Leonard B\u00e4rmann, Seymanur Akti, Hazim Kemal Ekenel, and Alexander Waibel. 2024. Audio-Visual Speech Representation Expert for Enhanced Talking Face Video Generation and Evaluation. In CVPR workshop."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1392"},{"key":"e_1_3_2_1_75_1","volume-title":"Proc. ICLR.","author":"Yemini Yochai","year":"2024","unstructured":"Yochai Yemini, Aviv Shamsian, Lior Bracha, Sharon Gannot, and Ethan Fetaya. 2024. LipVoicer: Generating Speech from Silent Videos Guided by Lip Reading. In Proc. ICLR."},{"key":"e_1_3_2_1_76_1","volume-title":"Dae Hoe Kim, and Yong Man Ro","author":"Yeo Jeong Hun","year":"2024","unstructured":"Jeong Hun Yeo, Minsu Kim, Jeongsoo Choi, Dae Hoe Kim, and Yong Man Ro. 2024. Akvsr: Audio knowledge empowered visual speech recognition by compressing audio knowledge of a pretrained model. IEEE Trans. on Multimedia (2024)."},{"key":"e_1_3_2_1_77_1","volume-title":"Proc. ICLR.","author":"Yu Sihyun","year":"2025","unstructured":"Sihyun Yu, Sangkyung Kwak, Huiwon Jang, Jongheon Jeong, Jonathan Huang, Jinwoo Shin, and Saining Xie. 2025. Representation Alignment for Generation: Training Diffusion Transformers Is Easier Than You Think. In Proc. ICLR."},{"key":"e_1_3_2_1_78_1","volume-title":"Soundstream: An end-to-end neural audio codec","author":"Zeghidour Neil","year":"2021","unstructured":"Neil Zeghidour, Alejandro Luebs, Ahmed Omran, Jan Skoglund, and Marco Tagliasacchi. 2021. Soundstream: An end-to-end neural audio codec. IEEE\/ACM Trans. on Audio, Speech, and Language Processing (2021)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754577","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:12:44Z","timestamp":1765339964000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754577"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":78,"alternative-id":["10.1145\/3746027.3754577","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754577","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}