{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T13:24:21Z","timestamp":1769606661765,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755502","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:47:42Z","timestamp":1761371262000},"page":"10248-10257","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["UniTalker: Conversational Speech-Visual Synthesis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2276-1456","authenticated-orcid":false,"given":"Yifan","family":"Hu","sequence":"first","affiliation":[{"name":"Inner Mongolia University, Hohhot, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4524-7413","authenticated-orcid":false,"given":"Rui","family":"Liu","sequence":"additional","affiliation":[{"name":"Inner Mongolia University, Hohhot, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9160-3848","authenticated-orcid":false,"given":"Yi","family":"Ren","sequence":"additional","affiliation":[{"name":"ByteDance, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0472-2783","authenticated-orcid":false,"given":"Xiang","family":"Yin","sequence":"additional","affiliation":[{"name":"ByteDance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9158-9401","authenticated-orcid":false,"given":"Haizhou","family":"Li","sequence":"additional","affiliation":[{"name":"SRIBD, School of Data Science, The Chinese University of Hong Kong, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Funaudiollm: Voice understanding and generation foundation models for natural interaction between humans and llms. arXiv preprint arXiv:2407.04051","author":"An Keyu","year":"2024","unstructured":"Keyu An, Qian Chen, Chong Deng, Zhihao Du, Changfeng Gao, Zhifu Gao, Yue Gu, Ting He, Hangrui Hu, Kai Hu, et al., 2024. Funaudiollm: Voice understanding and generation foundation models for natural interaction between humans and llms. arXiv preprint arXiv:2407.04051 (2024)."},{"key":"e_1_3_2_1_2_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, Vol. 33 (2020), 12449-12460."},{"key":"e_1_3_2_1_3_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2.5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-023-17100-3"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.3724\/2096-7004.di.2024.0001"},{"key":"e_1_3_2_1_6_1","volume-title":"Echomimic: Lifelike audio-driven portrait animations through editable landmark conditions. arXiv preprint arXiv:2407.08136","author":"Chen Zhiyuan","year":"2024","unstructured":"Zhiyuan Chen, Jiajiong Cao, Zhiquan Chen, Yuming Li, and Chenguang Ma. 2024. Echomimic: Lifelike audio-driven portrait animations through editable landmark conditions. arXiv preprint arXiv:2407.08136 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models. arXiv preprint arXiv:2311.07919","author":"Chu Yunfei","year":"2023","unstructured":"Yunfei Chu, Jin Xu, Xiaohuan Zhou, Qian Yang, Shiliang Zhang, Zhijie Yan, Chang Zhou, and Jingren Zhou. 2023. Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models. arXiv preprint arXiv:2311.07919 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Moshi: a speech-text foundation model for real-time dialogue. arXiv preprint arXiv:2410.00037","author":"D\u00e9fossez Alexandre","year":"2024","unstructured":"Alexandre D\u00e9fossez, Laurent Mazar\u00e9, Manu Orsini, Am\u00e9lie Royer, Patrick P\u00e9rez, Herv\u00e9 J\u00e9gou, Edouard Grave, and Neil Zeghidour. 2024. Moshi: a speech-text foundation model for real-time dialogue. arXiv preprint arXiv:2410.00037 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens. arXiv preprint arXiv:2407.05407","author":"Du Zhihao","year":"2024","unstructured":"Zhihao Du, Qian Chen, Shiliang Zhang, Kai Hu, Heng Lu, Yexin Yang, Hangrui Hu, Siqi Zheng, Yue Gu, Ziyang Ma, et al., 2024a. Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens. arXiv preprint arXiv:2407.05407 (2024)."},{"key":"e_1_3_2_1_10_1","unstructured":"Zhihao Du Yuxuan Wang Qian Chen Xian Shi Xiang Lv Tianyu Zhao Zhifu Gao Yexin Yang Changfeng Gao Hui Wang et al. 2024b. Cosyvoice 2: Scalable streaming speech synthesis with large language models. arXiv preprint arXiv:2412.10117 (2024)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_2_1_12_1","volume-title":"Llama-omni: Seamless speech interaction with large language models. arXiv preprint arXiv:2409.06666","author":"Fang Qingkai","year":"2024","unstructured":"Qingkai Fang, Shoutao Guo, Yan Zhou, Zhengrui Ma, Shaolei Zhang, and Yang Feng. 2024. Llama-omni: Seamless speech interaction with large language models. arXiv preprint arXiv:2409.06666 (2024)."},{"key":"e_1_3_2_1_13_1","volume-title":"Empathyear: An open-source avatar multimodal empathetic chatbot. arXiv preprint arXiv:2406.15177","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Han Zhang, Bin Wang, Lizi Liao, Qian Liu, and Erik Cambria. 2024. Empathyear: An open-source avatar multimodal empathetic chatbot. arXiv preprint arXiv:2406.15177 (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.5555\/177910.177914"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383460"},{"key":"e_1_3_2_1_16_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3084873.3084900"},{"key":"e_1_3_2_1_18_1","volume-title":"A survey on large language models for code generation. arXiv preprint arXiv:2406.00515","author":"Jiang Juyong","year":"2024","unstructured":"Juyong Jiang, Fan Wang, Jiasi Shen, Sungju Kim, and Sunghun Kim. 2024. A survey on large language models for code generation. arXiv preprint arXiv:2406.00515 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"Forty-first International Conference on Machine Learning, ICML 2024","author":"Ju Zeqian","year":"2024","unstructured":"Zeqian Ju, Yuancheng Wang, Kai Shen, Xu Tan, et al., 2024. NaturalSpeech 3: Zero-Shot Speech Synthesis with Factorized Codec and Diffusion Models. In Forty-first International Conference on Machine Learning, ICML 2024, Vienna, Austria, July 21-27, 2024. OpenReview.net."},{"key":"e_1_3_2_1_20_1","volume-title":"Unified speech-text pretraining for spoken dialog modeling. arXiv e-prints","author":"Kim Heeseung","year":"2024","unstructured":"Heeseung Kim, Soonshin Seo, Kyeongseok Jeong, Ohsung Kwon, Jungwhan Kim, Jaehong Lee, Eunwoo Song, Myungwoo Oh, Sungroh Yoon, and Kang Min Yoo. 2024. Unified speech-text pretraining for spoken dialog modeling. arXiv e-prints (2024), arXiv-2402."},{"key":"e_1_3_2_1_21_1","volume-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in neural information processing systems","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in neural information processing systems, Vol. 33 (2020), 17022-17033."},{"key":"e_1_3_2_1_22_1","volume-title":"ViTGAN: Training GANs with Vision Transformers. In The Tenth International Conference on Learning Representations, ICLR 2022","author":"Lee Kwonjoon","year":"2022","unstructured":"Kwonjoon Lee, Huiwen Chang, Lu Jiang, Han Zhang, Zhuowen Tu, and Ce Liu. 2022. ViTGAN: Training GANs with Vision Transformers. In The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25-29, 2022. OpenReview.net."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095751"},{"key":"e_1_3_2_1_24_1","volume-title":"Single-codec: Single-codebook speech codec towards high-performance speech generation. arXiv preprint arXiv:2406.07422","author":"Li Hanzhao","year":"2024","unstructured":"Hanzhao Li, Liumeng Xue, Haohan Guo, Xinfa Zhu, Yuanjun Lv, Lei Xie, Yunlin Chen, Hao Yin, and Zhifei Li. 2024a. Single-codec: Single-codebook speech codec towards high-performance speech generation. arXiv preprint arXiv:2406.07422 (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547831"},{"key":"e_1_3_2_1_26_1","volume-title":"European Conference on Computer Vision. Springer, 127-145","author":"Li Jiahe","year":"2024","unstructured":"Jiahe Li, Jiawei Zhang, Xiao Bai, Jin Zheng, Xin Ning, Jun Zhou, and Lin Gu. 2024b. Talkinggaussian: Structure-persistent 3d talking head synthesis via gaussian splatting. In European Conference on Computer Vision. Springer, 127-145."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.ACL-LONG.358"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446933"},{"key":"e_1_3_2_1_29_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023a. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29833"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681697"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613823"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0196391"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448291"},{"key":"e_1_3_2_1_35_1","volume-title":"Finite Scalar Quantization: VQ-VAE Made Simple. In The Twelfth International Conference on Learning Representations, ICLR 2024","author":"Mentzer Fabian","year":"2024","unstructured":"Fabian Mentzer, David Minnen, Eirikur Agustsson, and Michael Tschannen. 2024. Finite Scalar Quantization: VQ-VAE Made Simple. In The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024. OpenReview.net."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00545"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.3724\/2096-7004.di.2024.0049"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.ACL-LONG.860"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Machine Learning, ICML 2023","volume":"28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, et al., 2023. Robust Speech Recognition via Large-Scale Weak Supervision. In International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA (Proceedings of Machine Learning Research, Vol. 202). PMLR, 28492-28518. https:\/\/proceedings.mlr.press\/v202\/radford23a.html"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/D19-1410"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2006.881959"},{"key":"e_1_3_2_1_43_1","volume-title":"European Conference on Computer Vision. Springer, 244-260","author":"Tian Linrui","year":"2024","unstructured":"Linrui Tian, Qi Wang, Bang Zhang, and Liefeng Bo. 2024. Emo: Emote portrait alive generating expressive portrait videos with audio2video diffusion model under weak conditions. In European Conference on Computer Vision. Springer, 244-260."},{"key":"e_1_3_2_1_44_1","volume-title":"Neural Discrete Representation Learning. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"van den Oord A\u00e4ron","year":"2017","unstructured":"A\u00e4ron van den Oord, Oriol Vinyals, and Koray Kavukcuoglu. 2017. Neural Discrete Representation Learning. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA. 6306-6315. https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/7a98af17e63a0ac09ce2e96d03992fbc-Abstract.html"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1192"},{"key":"e_1_3_2_1_46_1","unstructured":"Chengyi Wang Sanyuan Chen Yu Wu Ziqiang Zhang Long Zhou Shujie Liu Zhuo Chen Yanqing Liu Huaming Wang Jinyu Li et al. 2023. Neural codec language models are zero-shot text to speech synthesizers. arXiv preprint arXiv:2301.02111 (2023)."},{"key":"e_1_3_2_1_47_1","first-page":"28281","article-title":"Omnitokenizer: A joint image-video tokenizer for visual generation","volume":"37","author":"Wang Junke","year":"2024","unstructured":"Junke Wang, Yi Jiang, Zehuan Yuan, Bingyue Peng, Zuxuan Wu, and Yu-Gang Jiang. 2024a. Omnitokenizer: A joint image-video tokenizer for visual generation. Advances in Neural Information Processing Systems, Vol. 37 (2024), 28281-28295.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_42"},{"key":"e_1_3_2_1_49_1","volume-title":"Freeze-omni: A smart and low latency speech-to-speech dialogue model with frozen llm. arXiv preprint arXiv:2411.00774","author":"Wang Xiong","year":"2024","unstructured":"Xiong Wang, Yangze Li, Chaoyou Fu, Yunhang Shen, Lei Xie, Ke Li, Xing Sun, and Long Ma. 2024b. Freeze-omni: A smart and low latency speech-to-speech dialogue model with frozen llm. arXiv preprint arXiv:2411.00774 (2024)."},{"key":"e_1_3_2_1_50_1","volume-title":"Image quality assessment: from error visibility to structural similarity","author":"Wang Zhou","year":"2004","unstructured":"Zhou Wang, Alan C Bovik, Hamid R Sheikh, and Eero P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing, Vol. 13, 4 (2004), 600-612."},{"key":"e_1_3_2_1_51_1","volume-title":"Aniportrait: Audio-driven synthesis of photorealistic portrait animation. arXiv preprint arXiv:2403.17694","author":"Wei Huawei","year":"2024","unstructured":"Huawei Wei, Zejun Yang, and Zhisheng Wang. 2024. Aniportrait: Audio-driven synthesis of photorealistic portrait animation. arXiv preprint arXiv:2403.17694 (2024)."},{"key":"e_1_3_2_1_52_1","volume-title":"Mini-omni: Language models can hear, talk while thinking in streaming. arXiv preprint arXiv:2408.16725","author":"Xie Zhifei","year":"2024","unstructured":"Zhifei Xie and Changqiao Wu. 2024. Mini-omni: Language models can hear, talk while thinking in streaming. arXiv preprint arXiv:2408.16725 (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"Bigcodec: Pushing the limits of low-bitrate neural speech codec. arXiv preprint arXiv:2409.05377","author":"Xin Detai","year":"2024","unstructured":"Detai Xin, Xu Tan, Shinnosuke Takamichi, and Hiroshi Saruwatari. 2024. Bigcodec: Pushing the limits of low-bitrate neural speech codec. arXiv preprint arXiv:2409.05377 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation. In Forty-first International Conference on Machine Learning, ICML 2024","author":"Xu Haoran","year":"2024","unstructured":"Haoran Xu, Amr Sharaf, et al., 2024b. Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation. In Forty-first International Conference on Machine Learning, ICML 2024, Vienna, Austria, July 21-27, 2024. OpenReview.net."},{"key":"e_1_3_2_1_55_1","volume-title":"Hallo: Hierarchical audio-driven visual synthesis for portrait image animation. arXiv preprint arXiv:2406.08801","author":"Xu Mingwang","year":"2024","unstructured":"Mingwang Xu, Hui Li, Qingkun Su, Hanlin Shang, Liwei Zhang, Ce Liu, Jingdong Wang, Yao Yao, and Siyu Zhu. 2024a. Hallo: Hierarchical audio-driven visual synthesis for portrait image animation. arXiv preprint arXiv:2406.08801 (2024)."},{"key":"e_1_3_2_1_56_1","volume-title":"E-chat: Emotion-sensitive Spoken Dialogue System with Large Language Models. In 2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP). IEEE, 586-590","author":"Xue Hongfei","year":"2024","unstructured":"Hongfei Xue, Yuhao Liang, Bingshen Mu, Shiliang Zhang, Mengzhe Chen, Qian Chen, and Lei Xie. 2024. E-chat: Emotion-sensitive Spoken Dialogue System with Large Language Models. In 2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP). IEEE, 586-590."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096905"},{"key":"e_1_3_2_1_58_1","unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei et al. 2024. Qwen2. 5 technical report. arXiv preprint arXiv:2412.15115 (2024)."},{"key":"e_1_3_2_1_59_1","volume-title":"Hifi-codec: Group-residual vector quantization for high fidelity audio codec. arXiv preprint arXiv:2305.02765","author":"Yang Dongchao","year":"2023","unstructured":"Dongchao Yang, Songxiang Liu, Rongjie Huang, Jinchuan Tian, Chao Weng, and Yuexian Zou. 2023. Hifi-codec: Group-residual vector quantization for high fidelity audio codec. arXiv preprint arXiv:2305.02765 (2023)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2022.EMNLP-MAIN.296"},{"key":"e_1_3_2_1_61_1","first-page":"128940","article-title":"An image is worth 32 tokens for reconstruction and generation","volume":"37","author":"Yu Qihang","year":"2024","unstructured":"Qihang Yu, Mark Weber, Xueqing Deng, Xiaohui Shen, Daniel Cremers, and Liang-Chieh Chen. 2024. An image is worth 32 tokens for reconstruction and generation. Advances in Neural Information Processing Systems, Vol. 37 (2024), 128940-128966.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2023.FINDINGS-EMNLP.1055"},{"key":"e_1_3_2_1_63_1","volume-title":"Towards multimodal empathetic response generation: A rich text-speech-vision avatar-based benchmark. arXiv preprint arXiv:2502.04976","author":"Zhang Han","year":"2025","unstructured":"Han Zhang, Zixiang Meng, Meng Luo, Hong Han, Lizi Liao, Erik Cambria, and Hao Fei. 2025. Towards multimodal empathetic response generation: A rich text-speech-vision avatar-based benchmark. arXiv preprint arXiv:2502.04976 (2025)."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"e_1_3_2_1_66_1","volume-title":"Speechtokenizer: Unified speech tokenizer for speech large language models. arXiv preprint arXiv:2308.16692","author":"Zhang Xin","year":"2023","unstructured":"Xin Zhang, Dong Zhang, Shimin Li, Yaqian Zhou, and Xipeng Qiu. 2023c. Speechtokenizer: Unified speech tokenizer for speech large language models. arXiv preprint arXiv:2308.16692 (2023)."},{"key":"e_1_3_2_1_67_1","volume-title":"R1-omni: Explainable omni-multimodal emotion recognition with reinforcement learning. arXiv preprint arXiv:2503.05379","author":"Zhao Jiaxing","year":"2025","unstructured":"Jiaxing Zhao, Xihan Wei, and Liefeng Bo. 2025. R1-omni: Explainable omni-multimodal emotion recognition with reinforcement learning. arXiv preprint arXiv:2503.05379 (2025)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_38"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755502","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:22:46Z","timestamp":1765308166000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755502"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":68,"alternative-id":["10.1145\/3746027.3755502","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755502","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}