{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:51:09Z","timestamp":1765507869039,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","funder":[{"name":"Guangdong Provincial Key Lab of Integrated Communication, Sensing and Computation for Ubiquitous Internet of Things","award":["No.2023B1212010007"],"award-info":[{"award-number":["No.2023B1212010007"]}]},{"name":"China NSFC Grant","award":["No.62472366"],"award-info":[{"award-number":["No.62472366"]}]},{"name":"111 Center","award":["No.D25008"],"award-info":[{"award-number":["No.D25008"]}]},{"name":"the Project of DEGP","award":["No.2024GCZX003, 2023KCXTD042"],"award-info":[{"award-number":["No.2024GCZX003, 2023KCXTD042"]}]},{"name":"Shenzhen Science and Technology Foundation","award":["ZDSYS20190902092853047"],"award-info":[{"award-number":["ZDSYS20190902092853047"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3761206","type":"proceedings-article","created":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T01:03:27Z","timestamp":1762563807000},"page":"2811-2820","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Hearing the Meaning, Not the Mess: Beyond Literal Transcription for Spoken Language"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-0071-5392","authenticated-orcid":false,"given":"Min","family":"Sun","sequence":"first","affiliation":[{"name":"China Mobile Information Technology Co., Ltd, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6600-0554","authenticated-orcid":false,"given":"Ke","family":"Xu","sequence":"additional","affiliation":[{"name":"China Mobile Information Technology Co., Ltd, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8963-5138","authenticated-orcid":false,"given":"Jiarong","family":"Liu","sequence":"additional","affiliation":[{"name":"China Mobile Information Technology Co., Ltd, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1334-1827","authenticated-orcid":false,"given":"Jifan","family":"Yang","sequence":"additional","affiliation":[{"name":"China Mobile Information Technology Co., Ltd, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4434-4718","authenticated-orcid":false,"given":"Yan","family":"Fang","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5879-585X","authenticated-orcid":false,"given":"Weizheng","family":"Wang","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5500-0249","authenticated-orcid":false,"given":"Qipeng","family":"Xie","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1758-2870","authenticated-orcid":false,"given":"Shuxin","family":"Zhong","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2216-0737","authenticated-orcid":false,"given":"Kaishun","family":"Wu","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747756"},{"key":"e_1_3_2_1_2_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, Vol. 33 (2020), 12449-12460."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i21.30381"},{"key":"e_1_3_2_1_4_1","volume-title":"mslam: Massively multilingual joint pre-training for speech and text. arXiv preprint arXiv:2202.01374","author":"Bapna Ankur","year":"2022","unstructured":"Ankur Bapna, Colin Cherry, Yu Zhang, Ye Jia, Melvin Johnson, Yong Cheng, Simran Khanuja, Jason Riesa, and Alexis Conneau. 2022. mslam: Massively multilingual joint pre-training for speech and text. arXiv preprint arXiv:2202.01374 (2022)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"volume-title":"Analyzing emotion in spontaneous speech","author":"Chakraborty Rupayan","key":"e_1_3_2_1_6_1","unstructured":"Rupayan Chakraborty, Meghna Pandharipande, and Sunil Kumar Kopparapu. 2017. Analyzing emotion in spontaneous speech. Springer."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_1_9_1","volume-title":"Polyvoice: Language models for speech to speech translation. arXiv preprint arXiv:2306.02982","author":"Dong Qianqian","year":"2023","unstructured":"Qianqian Dong, Zhiying Huang, Qiao Tian, Chen Xu, Tom Ko, Yunlong Zhao, Siyuan Feng, Tang Li, Kexin Wang, Xuxin Cheng, et al., 2023. Polyvoice: Language models for speech to speech translation. arXiv preprint arXiv:2306.02982 (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Sequence transduction with recurrent neural networks. arXiv preprint arXiv:1211.3711","author":"Graves Alex","year":"2012","unstructured":"Alex Graves. 2012. Sequence transduction with recurrent neural networks. arXiv preprint arXiv:1211.3711 (2012)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_1_12_1","volume-title":"Conformer: Convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100","author":"Gulati Anmol","year":"2020","unstructured":"Anmol Gulati, James Qin, Chung-Cheng Chiu, Niki Parmar, Yu Zhang, Jiahui Yu, Wei Han, Shibo Wang, Zhengdong Zhang, Yonghui Wu, et al., 2020. Conformer: Convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100 (2020)."},{"key":"e_1_3_2_1_13_1","volume-title":"Proc. Interspeech","author":"Jiayu Du Wei-Qiang Guanbo Wang","year":"2021","unstructured":"Guanbo Wang Jiayu Du Wei-Qiang Zhang Chao Weng Dan Su Daniel Povey Jan Trmal Junbo Zhang Mingjie Jin Sanjeev Khudanpur Shinji Watanabe Shuaijiang Zhao Wei Zou Xiangang Li Xuchen Yao Yongqing Wang Yujun Wang Zhao You Zhiyong Yan Guoguo Chen, Shuzhou Chai. 2021. GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio. In Proc. Interspeech 2021."},{"key":"e_1_3_2_1_14_1","unstructured":"Awni Hannun Carl Case Jared Casper Bryan Catanzaro Greg Diamos Erich Elsen Ryan Prenger Sanjeev Satheesh Shubho Sengupta Adam Coates et al. 2014. Deep speech: Scaling up end-to-end speech recognition. arXiv preprint arXiv:1412.5567 (2014)."},{"key":"e_1_3_2_1_15_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing, Vol. 29 (2021), 3451-3460."},{"key":"e_1_3_2_1_16_1","volume-title":"Ruizhe Li, Chao Zhang, Pin-Yu Chen, and EnSiong Chng.","author":"Hu Yuchen","year":"2024","unstructured":"Yuchen Hu, Chen Chen, Chao-Han Huck Yang, Ruizhe Li, Chao Zhang, Pin-Yu Chen, and EnSiong Chng. 2024. Large language models are efficient learners of noise-robust speech recognition. arXiv preprint arXiv:2401.10446 (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i23.34610"},{"key":"e_1_3_2_1_18_1","first-page":"1","article-title":"A path towards autonomous machine intelligence version 0.9. 2, 2022-06-27","volume":"62","author":"LeCun Yann","year":"2022","unstructured":"Yann LeCun. 2022. A path towards autonomous machine intelligence version 0.9. 2, 2022-06-27. Open Review, Vol. 62, 1 (2022), 1-62.","journal-title":"Open Review"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003906"},{"key":"e_1_3_2_1_20_1","volume-title":"Self-supervised learning: Generative or contrastive","author":"Liu Xiao","year":"2021","unstructured":"Xiao Liu, Fanjin Zhang, Zhenyu Hou, Li Mian, Zhaoyu Wang, Jing Zhang, and Jie Tang. 2021. Self-supervised learning: Generative or contrastive. IEEE transactions on knowledge and data engineering, Vol. 35, 1 (2021), 857-876."},{"volume-title":"Spoken dialogue technology: toward the conversational user interface","author":"McTear Michael F","key":"e_1_3_2_1_21_1","unstructured":"Michael F McTear. 2004. Spoken dialogue technology: toward the conversational user interface. Springer Science & Business Media."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"e_1_3_2_1_23_1","volume-title":"International conference on machine learning. PMLR, 28492-28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. PMLR, 28492-28518."},{"key":"e_1_3_2_1_24_1","volume-title":"Multi-resolution HuBERT: Multi-resolution speech self-supervised learning with masked unit prediction. arXiv preprint arXiv:2310.02720","author":"Shi Jiatong","year":"2023","unstructured":"Jiatong Shi, Hirofumi Inaguma, Xutai Ma, Ilia Kulikov, and Anna Sun. 2023. Multi-resolution HuBERT: Multi-resolution speech self-supervised learning with masked unit prediction. arXiv preprint arXiv:2310.02720 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence. 6478-6485","author":"Tang Yixuan","year":"2024","unstructured":"Yixuan Tang and Anthony KH Tung. 2024. Contextualized speech recognition: rethinking second-pass rescoring with generative large language models. In Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence. 6478-6485."},{"key":"e_1_3_2_1_26_1","first-page":"36","article-title":"Speech to text and text to speech recognition systems-Areview","volume":"20","author":"Trivedi Ayushi","year":"2018","unstructured":"Ayushi Trivedi, Navya Pant, Pinal Shah, Simran Sonik, and Supriya Agrawal. 2018. Speech to text and text to speech recognition systems-Areview. IOSR J. Comput. Eng, Vol. 20, 2 (2018), 36-43.","journal-title":"IOSR J. Comput. Eng"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Changhan Wang Morgane Riviere Ann Lee Anne Wu Chaitanya Talnikar Daniel Haziza Mary Williamson Juan Pino and Emmanuel Dupoux. 2021. VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation Learning Semi-Supervised Learning and Interpretation. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). Association for Computational Linguistics Online 993-1003. https:\/\/aclanthology.org\/2021.acl-long.80","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2763455"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096988"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i24.34741"},{"key":"e_1_3_2_1_31_1","volume-title":"Proc. Int. Conf. Learn. Representations. 1-21","author":"Xin Z","year":"2024","unstructured":"Z Xin, Z Dong, L Shimin, Z Yaqian, and Q Xipeng. 2024. Speechtokenizer: Unified speech tokenizer for speech language models. In Proc. Int. Conf. Learn. Representations. 1-21."},{"key":"e_1_3_2_1_32_1","volume-title":"Zipformer: A faster and better encoder for automatic speech recognition. arXiv preprint arXiv:2310.11230","author":"Yao Zengwei","year":"2023","unstructured":"Zengwei Yao, Liyong Guo, Xiaoyu Yang, Wei Kang, Fangjun Kuang, Yifan Yang, Zengrui Jin, Long Lin, and Daniel Povey. 2023. Zipformer: A faster and better encoder for automatic speech recognition. arXiv preprint arXiv:2310.11230 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675","author":"Zhang Tianyi","year":"2019","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q Weinberger, and Yoav Artzi. 2019. Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675 (2019)."}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Seoul Republic of Korea","acronym":"CIKM '25"},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3761206","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:47:07Z","timestamp":1765507627000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3761206"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":33,"alternative-id":["10.1145\/3746252.3761206","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3761206","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}