{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T16:03:58Z","timestamp":1780761838316,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","award":["2022ZD0119100"],"award-info":[{"award-number":["2022ZD0119100"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62441605"],"award-info":[{"award-number":["No. 62441605"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the StarryNight Science Fund of Zhejiang University Shang- hai Institute for Advanced Study","award":["SN-ZJU-SIAS-0010"],"award-info":[{"award-number":["SN-ZJU-SIAS-0010"]}]},{"name":"Key Research and Development Program of Zhejiang Province","award":["No. 2024C03270"],"award-info":[{"award-number":["No. 2024C03270"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681675","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"3548-3557","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":31,"title":["GaussianTalker: Speaker-specific Talking Head Synthesis via 3D Gaussian Splatting"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-2426-1014","authenticated-orcid":false,"given":"Hongyun","family":"Yu","sequence":"first","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5570-0884","authenticated-orcid":false,"given":"Zhan","family":"Qu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2650-5080","authenticated-orcid":false,"given":"Qihang","family":"Yu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5784-0965","authenticated-orcid":false,"given":"Jianchuan","family":"Chen","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6782-830X","authenticated-orcid":false,"given":"Zhonghua","family":"Jiang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8302-8839","authenticated-orcid":false,"given":"Zhiwen","family":"Chen","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0030-8289","authenticated-orcid":false,"given":"Shengyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University &amp; Shanghai Institute for Advanced Study, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8932-066X","authenticated-orcid":false,"given":"Jimin","family":"Xu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2139-8807","authenticated-orcid":false,"given":"Fei","family":"Wu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4918-7425","authenticated-orcid":false,"given":"Chengfei","family":"Lv","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1185-1244","authenticated-orcid":false,"given":"Gang","family":"Yu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Joon Son Chung, and Andrew Zisserman","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, and Andrew Zisserman. 2018. LRS3-TED: a large-scale dataset for visual speech recognition. arXiv preprint arXiv:1809.00496 (2018)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3596711.3596730"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/311535.311537"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3596711.3596787"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.116"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"crossref","first-page":"413","DOI":"10.1109\/TVCG.2013.249","article-title":"Facewarehouse: A 3d facial expression database for visual computing","volume":"20","author":"Cao Chen","year":"2013","unstructured":"Chen Cao, YanlinWeng, Shun Zhou, Yiying Tong, and Kun Zhou. 2013. Facewarehouse: A 3d facial expression database for visual computing. IEEE Transactions on Visualization and Computer Graphics 20, 3 (2013), 413--425.","journal-title":"IEEE Transactions on Visualization and Computer Graphics"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG57933.2023.10042567"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095796"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_32"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00802"},{"key":"e_1_3_2_2_11_1","volume-title":"Monogaussianavatar: Monocular gaussian point-based head avatar. arXiv preprint arXiv:2312.04558","author":"Chen Yufan","year":"2023","unstructured":"Yufan Chen, LizhenWang, Qijing Li, Hongjiang Xiao, Shengping Zhang, Hongxun Yao, and Yebin Liu. 2023. Monogaussianavatar: Monocular gaussian point-based head avatar. arXiv preprint arXiv:2312.04558 (2023)."},{"key":"e_1_3_2_2_12_1","volume-title":"Voxceleb2: Deep speaker recognition. arXiv preprint arXiv:1806.05622","author":"Chung Joon Son","year":"2018","unstructured":"Joon Son Chung, Arsha Nagrani, and Andrew Zisserman. 2018. Voxceleb2: Deep speaker recognition. arXiv preprint arXiv:1806.05622 (2018)."},{"key":"e_1_3_2_2_13_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 20311--20322","author":"Black Michael J","year":"2022","unstructured":"Michael J Black, and Timo Bolkart. 2022. Emoca: Emotion driven monocular face capture and animation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 20311--20322."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/566654.566594"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00854"},{"key":"e_1_3_2_2_16_1","volume-title":"Generative adversarial nets. Advances in neural information processing systems 27","author":"Goodfellow Ian","year":"2014","unstructured":"Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2014. Generative adversarial nets. Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295408"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.632"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01150-y"},{"key":"e_1_3_2_2_21_1","volume-title":"MNN: A Universal and Efficient Inference Engine. In MLSys.","author":"Jiang Xiaotang","year":"2020","unstructured":"Xiaotang Jiang, HuanWang, Yiliu Chen, ZiqiWu, LichuanWang, Bin Zou, Yafeng Yang, Zongyang Cui, Yu Cai, Tianhang Yu, Chengfei Lv, and Zhihua Wu. 2020. MNN: A Universal and Efficient Inference Engine. In MLSys."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073658"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"Bernhard Kerbl Georgios Kopanas Thomas Leimk\u00fchler and George Drettakis. 2023. 3D Gaussian Splatting for Real-Time Radiance Field Rendering. (2023).","DOI":"10.1145\/3592433"},{"key":"e_1_3_2_2_24_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"crossref","unstructured":"Till Kroeger Radu Timofte Dengxin Dai and Luc Van Gool. 2016. Fast Optical Flow using Dense Inverse Search. arXiv:arXiv:1603.03590","DOI":"10.1007\/978-3-319-46493-0_29"},{"key":"e_1_3_2_2_26_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 7568--7578","author":"Li Jiahe","year":"2023","unstructured":"Jiahe Li, Jiawei Zhang, Xiao Bai, Jun Zhou, and Lin Gu. 2023. Efficient regionaware neural radiance fields for high-fidelity talking portrait synthesis. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 7568--7578."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3130800.3130813"},{"key":"e_1_3_2_2_28_1","volume-title":"Sifei Liu, Koki Nagano, Umar Iqbal, and Jan Kautz.","author":"Li Xueting","year":"2024","unstructured":"Xueting Li, Shalini De Mello, Sifei Liu, Koki Nagano, Umar Iqbal, and Jan Kautz. 2024. Generalizable One-shot 3D Neural Head Avatar. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_7"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480484"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"e_1_3_2_2_32_1","unstructured":"Shigeo Morishima. 1998. Real-time Talking Head Driven by Voice and its Application to Communication and Entertainment. 195--200."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00581"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"Pascal Paysan Reinhard Knothe Brian Amberg Sami Romdhani and Thomas Vetter. 2009. A 3D face model for pose and illumination invariant face recognition. In 2009 sixth IEEE international conference on advanced video and signal based surveillance. Ieee 296--301.","DOI":"10.1109\/AVSS.2009.58"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611734"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01018"},{"key":"e_1_3_2_2_38_1","volume-title":"OpenVoice: Versatile Instant Voice Cloning. arXiv preprint arXiv:2312.01479","author":"Qin Zengyi","year":"2023","unstructured":"Zengyi Qin, Wenliang Zhao, Xumin Yu, and Xin Sun. 2023. OpenVoice: Versatile Instant Voice Cloning. arXiv preprint arXiv:2312.01479 (2023)."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"e_1_3_2_2_40_1","volume-title":"Rig3DGS: Creating Controllable Portraits from Casual Monocular Videos. arXiv preprint arXiv:2402.03723","author":"Rivero Alfredo","year":"2024","unstructured":"Alfredo Rivero, ShahRukh Athar, Zhixin Shu, and Dimitris Samaras. 2024. Rig3DGS: Creating Controllable Portraits from Casual Monocular Videos. arXiv preprint arXiv:2402.03723 (2024)."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58555-6_4"},{"key":"e_1_3_2_2_42_1","volume-title":"SplattingAvatar: Realistic Real-Time Human Avatars with Mesh-Embedded Gaussian Splatting. arXiv preprint arXiv:2403.05087","author":"Shao Zhijing","year":"2024","unstructured":"Zhijing Shao, Zhaolong Wang, Zhuang Li, Duotun Wang, Xiangru Lin, Yu Zhang, Mingming Fan, and Zeyu Wang. 2024. SplattingAvatar: Realistic Real-Time Human Avatars with Mesh-Embedded Gaussian Splatting. arXiv preprint arXiv:2403.05087 (2024)."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19775-8_39"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.2196\/35358"},{"key":"e_1_3_2_2_45_1","volume-title":"Real-time neural radiance talking portrait synthesis via audio-spatial decomposition. arXiv preprint arXiv:2211.12368","author":"Tang Jiaxiang","year":"2022","unstructured":"Jiaxiang Tang, Kaisiyuan Wang, Hang Zhou, Xiaokang Chen, Dongliang He, Tianshu Hu, Jingtuo Liu, Gang Zeng, and JingdongWang. 2022. Real-time neural radiance talking portrait synthesis via audio-spatial decomposition. arXiv preprint arXiv:2211.12368 (2022)."},{"key":"e_1_3_2_2_46_1","volume-title":"Proceedings, Part XVI 16","author":"Thies Justus","year":"2020","unstructured":"Justus Thies, Mohamed Elgharib, Ayush Tewari, Christian Theobalt, and Matthias Nie\u00dfner. 2020. Neural voice puppetry: Audio-driven facial reenactment. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XVI 16. Springer, 716--731."},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"e_1_3_2_2_48_1","volume-title":"Representation Learning with Contrastive Predictive Coding. CoRR abs\/1807.03748","author":"van den Oord A\u00e4ron","year":"2018","unstructured":"A\u00e4ron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. CoRR abs\/1807.03748 (2018). arXiv:1807.03748 http:\/\/arxiv.org\/abs\/1807.03748"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01251-8"},{"key":"e_1_3_2_2_50_1","volume-title":"Gaussianhead: High-fidelity head avatars with learnable gaussian derivation. arXiv preprint arXiv:2312.01632","author":"Wang Jie","year":"2023","unstructured":"Jie Wang, Jiu-Cheng Xie, Xianyan Li, Feng Xu, Chi-Man Pun, and Hao Gao. 2023. Gaussianhead: High-fidelity head avatars with learnable gaussian derivation. arXiv preprint arXiv:2312.01632 (2023)."},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_42"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"e_1_3_2_2_53_1","volume-title":"OpenGL programming guide: the official guide to learning OpenGL, version 1.2","author":"Woo Mason","unstructured":"Mason Woo, Jackie Neider, Tom Davis, and Dave Shreiner. 1999. OpenGL programming guide: the official guide to learning OpenGL, version 1.2. Addison-Wesley Longman Publishing Co., Inc."},{"key":"e_1_3_2_2_54_1","unstructured":"Qiantong Xu Alexei Baevski and Michael Auli. 2021. Simple and Effective Zero-shot Cross-lingual Phoneme Recognition. arXiv:2109.11680 [cs.CL]"},{"key":"e_1_3_2_2_55_1","volume-title":"Dfa-nerf: Personalized talking head generation via disentangled face attributes neural rendering. arXiv preprint arXiv:2201.00791","author":"Yao Shunyu","year":"2022","unstructured":"Shunyu Yao, RuiZhe Zhong, Yichao Yan, Guangtao Zhai, and Xiaokang Yang. 2022. Dfa-nerf: Personalized talking head generation via disentangled face attributes neural rendering. arXiv preprint arXiv:2201.00791 (2022)."},{"key":"e_1_3_2_2_56_1","volume-title":"GeneFace: Generalized and Stable Real-Time Audio-Driven 3D Talking Face Generation. arXiv preprint arXiv:2305.00787","author":"Ye Zhenhui","year":"2023","unstructured":"Zhenhui Ye, Jinzheng He, Ziyue Jiang, Rongjie Huang, Jiawei Huang, Jinglin Liu, Yi Ren, Xiang Yin, Zejun Ma, and Zhou Zhao. 2023. GeneFace: Generalized and Stable Real-Time Audio-Driven 3D Talking Face Generation. arXiv preprint arXiv:2305.00787 (2023)."},{"key":"e_1_3_2_2_57_1","volume-title":"Geneface: Generalized and high-fidelity audio-driven 3d talking face synthesis. arXiv preprint arXiv:2301.13430","author":"Ye Zhenhui","year":"2023","unstructured":"Zhenhui Ye, Ziyue Jiang, Yi Ren, Jinglin Liu, Jinzheng He, and Zhou Zhao. 2023. Geneface: Generalized and high-fidelity audio-driven 3d talking face synthesis. arXiv preprint arXiv:2301.13430 (2023)."},{"key":"e_1_3_2_2_58_1","volume-title":"Audiodriven talking face video generation with natural head pose. arXiv preprint arXiv:2002.10137 2, 6","author":"Yi Ran","year":"2020","unstructured":"Ran Yi, Zipeng Ye, Juyong Zhang, Hujun Bao, and Yong-Jin Liu. 2020. Audiodriven talking face video generation with natural head pose. arXiv preprint arXiv:2002.10137 2, 6 (2020), 7."},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.2973374"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25464"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01814"},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00938"},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3414685.3417774","article-title":"Makelttalk: speaker-aware talking-head animation","volume":"39","author":"Zhou Yang","year":"2020","unstructured":"Yang Zhou, Xintong Han, Eli Shechtman, Jose Echevarria, Evangelos Kalogerakis, and Dingzeyu Li. 2020. Makelttalk: speaker-aware talking-head animation. ACM Transactions On Graphics (TOG) 39, 6 (2020), 1--15.","journal-title":"ACM Transactions On Graphics (TOG)"},{"key":"e_1_3_2_2_67_1","volume-title":"HeadStudio: Text to Animatable Head Avatars with 3D Gaussian Splatting. arXiv preprint arXiv:2402.06149","author":"Zhou Zhenglin","year":"2024","unstructured":"Zhenglin Zhou, Fan Ma, Hehe Fan, and Yi Yang. 2024. HeadStudio: Text to Animatable Head Avatars with 3D Gaussian Splatting. arXiv preprint arXiv:2402.06149 (2024)."},{"key":"e_1_3_2_2_68_1","volume-title":"Computer graphics forum","author":"Zollh\u00f6fer Michael","unstructured":"Michael Zollh\u00f6fer, Justus Thies, Pablo Garrido, Derek Bradley, Thabo Beeler, Patrick P\u00e9rez, Marc Stamminger, Matthias Nie\u00dfner, and Christian Theobalt. 2018. State of the art on monocular 3D face reconstruction, tracking, and applications. In Computer graphics forum, Vol. 37. Wiley Online Library, 523--550."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681675","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681675","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:50Z","timestamp":1750295870000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681675"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":68,"alternative-id":["10.1145\/3664647.3681675","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681675","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}