{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:32:05Z","timestamp":1776882725959,"version":"3.51.2"},"reference-count":27,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T00:00:00Z","timestamp":1730937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T00:00:00Z","timestamp":1730937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Nature Science Foundation of China","doi-asserted-by":"publisher","award":["62301521"],"award-info":[{"award-number":["62301521"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003995","name":"Anhui Provincial Natural Science Foundation","doi-asserted-by":"publisher","award":["2308085QF200"],"award-info":[{"award-number":["2308085QF200"]}],"id":[{"id":"10.13039\/501100003995","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["WK2100000033"],"award-info":[{"award-number":["WK2100000033"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,11,7]]},"DOI":"10.1109\/iscslp63861.2024.10800013","type":"proceedings-article","created":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T19:11:17Z","timestamp":1734981077000},"page":"676-680","source":"Crossref","is-referenced-by-count":4,"title":["APCodec+: A Spectrum-Coding-Based High-Fidelity and High-Compression-Rate Neural Audio Codec with Staged Training Paradigm"],"prefix":"10.1109","author":[{"given":"Hui-Peng","family":"Du","sequence":"first","affiliation":[{"name":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China,Hefei"}]},{"given":"Yang","family":"Ai","sequence":"additional","affiliation":[{"name":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China,Hefei"}]},{"given":"Rui-Chen","family":"Zheng","sequence":"additional","affiliation":[{"name":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China,Hefei"}]},{"given":"Zhen-Hua","family":"Ling","sequence":"additional","affiliation":[{"name":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China,Hefei"}]}],"member":"263","reference":[{"issue":"10","key":"ref1","first-page":"780","article-title":"ISO\/MPEG-1 audio: A generic standard for coding of high-quality digital audio","volume":"42","author":"Brandenburg","year":"1994","journal-title":"Journal of the Audio Engineering Society"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1976.1170021"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1986.1164946"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/25.312763"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref6","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref7","article-title":"SpeechTok-enizer: Unified speech tokenizer for speech large language models","author":"Zhang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.314"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448454"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/MP.2006.1664069"},{"key":"ref11","article-title":"HiFi-Codec: Group-residual vector quantization for high fidelity audio codec","author":"Yang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref12","article-title":"Finite scalar quantization: Vq-vae made simple","volume-title":"The Twelfth International Conference on Learning Representations","author":"Mentzer","year":"2023"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/45.1890"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7179063"},{"key":"ref15","article-title":"High-quality, low-delay music coding in the opus codec","volume-title":"Audio En-gineering Society Convention 135","author":"Valin","year":"2013"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"ref17","article-title":"High fidelity neural audio compression","author":"D\u00e9fossez","year":"2023","journal-title":"Transactions on Machine Learning Research"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3417347"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096509"},{"key":"ref20","first-page":"17022","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. NeurIPS","volume":"33","author":"Kong","year":"2020"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01548"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096553"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1016"},{"key":"ref24","article-title":"Ex-presso: A benchmark and analysis of discrete expressive speech resynthesis","volume-title":"arXiv preprint","author":"Nguyen","year":"2023"},{"key":"ref25","article-title":"Decoupled weight decay regularization","volume-title":"Proc. ICLR","author":"Loshchilov","year":"2018"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3519881"},{"key":"ref27","first-page":"4521","article-title":"UTMOS: UTokyo-SaruLab System for Voice-MOS Challenge 2022","volume-title":"Proc. Interspeech","author":"Saeki","year":"2022"}],"event":{"name":"2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)","location":"Beijing, China","start":{"date-parts":[[2024,11,7]]},"end":{"date-parts":[[2024,11,10]]}},"container-title":["2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10799944\/10799969\/10800013.pdf?arnumber=10800013","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,24]],"date-time":"2024-12-24T06:26:12Z","timestamp":1735021572000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10800013\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,7]]},"references-count":27,"URL":"https:\/\/doi.org\/10.1109\/iscslp63861.2024.10800013","relation":{},"subject":[],"published":{"date-parts":[[2024,11,7]]}}}