{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T23:07:25Z","timestamp":1779836845598,"version":"3.53.1"},"reference-count":39,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.neunet.2026.109128","type":"journal-article","created":{"date-parts":[[2026,5,16]],"date-time":"2026-05-16T22:18:55Z","timestamp":1778969935000},"page":"109128","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["OACodec: Audio attribute disentanglement via orthogonal disentanglement and mutual information minimization"],"prefix":"10.1016","volume":"203","author":[{"given":"Yukun","family":"Qian","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wenjie","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zehua","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lianyu","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xuyi","family":"Zhuang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mingjiang","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.109128_bib0001","series-title":"Interspeech 2023","first-page":"2053","article-title":"Voice conversion with just nearest neighbors","author":"Baas","year":"2023"},{"key":"10.1016\/j.neunet.2026.109128_sbref0002","series-title":"Advances in neural information processing systems","first-page":"12449","article-title":"Wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"vol. 33","author":"Baevski","year":"2020"},{"key":"10.1016\/j.neunet.2026.109128_bib0003","series-title":"Icassp 2025 - 2025 ieee international conference on acoustics, speech and signal processing (icassp)","first-page":"1","article-title":"Learning source disentanglement in neural audio codec","author":"Bie","year":"2025"},{"key":"10.1016\/j.neunet.2026.109128_bib0004","doi-asserted-by":"crossref","unstructured":"Casanova, E., Davis, K., G\u00f6lge, E., G\u00f6knar, G., Gulea, I., Hart, L., Aljafari, A., Meyer, J., Morais, R., Olayemi, S., & Weber, J. (2024). XTTS: A massively multilingual zero-shot text-to-speech model. https:\/\/arxiv.org\/abs\/2406.04904.","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"10.1016\/j.neunet.2026.109128_sbref0005","series-title":"Proceedings of the 39th international conference on machine learning","first-page":"2709","article-title":"YourTTS: Towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone","volume":"vol. 162","author":"Casanova","year":"2022"},{"key":"10.1016\/j.neunet.2026.109128_bib0006","doi-asserted-by":"crossref","first-page":"705","DOI":"10.1109\/TASLPRO.2025.3530270","article-title":"Neural codec language models are zero-shot text to speech synthesizers","volume":"33","author":"Chen","year":"2025","journal-title":"IEEE Transactions on Audio, Speech and Language Processing"},{"key":"10.1016\/j.neunet.2026.109128_sbref0007","series-title":"Advances in neural information processing systems","article-title":"InfoGAN: Interpretable representation learning by information maximizing generative adversarial nets","volume":"vol. 29","author":"Chen","year":"2016"},{"key":"10.1016\/j.neunet.2026.109128_sbref0008","series-title":"Proceedings of the 37th international conference on machine learning","first-page":"1779","article-title":"CLUB: A contrastive log-ratio upper bound of mutual information","volume":"vol. 119","author":"Cheng","year":"2020"},{"key":"10.1016\/j.neunet.2026.109128_sbref0009","series-title":"Advances in neural information processing systems","first-page":"16251","article-title":"Neural analysis and synthesis: Reconstructing speech from self-supervised representations","volume":"vol. 34","author":"Choi","year":"2021"},{"key":"10.1016\/j.neunet.2026.109128_sbref0010","series-title":"The eleventh international conference on learning representations","article-title":"NANSY++: Unified voice synthesis with neural analysis and synthesis","author":"Choi","year":"2023"},{"key":"10.1016\/j.neunet.2026.109128_bib0011","series-title":"2021\u202fIEEE Automatic speech recognition and understanding workshop (ASRU)","first-page":"244","article-title":"W2v-BERT: Combining contrastive learning and masked language modeling for self-supervised speech pre-training","author":"Chung","year":"2021"},{"issue":"12","key":"10.1016\/j.neunet.2026.109128_bib0012","doi-asserted-by":"crossref","first-page":"13297","DOI":"10.1109\/TCSVT.2024.3443122","article-title":"Disentangled representation learning with transmitted information bottleneck","volume":"34","author":"Dang","year":"2024","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"10.1016\/j.neunet.2026.109128_bib0013","article-title":"High fidelity neural audio compression","author":"D\u00e9fossez","year":"2023","journal-title":"Transactions on Machine Learning Research"},{"key":"10.1016\/j.neunet.2026.109128_bib0014","series-title":"2015\u202fIEEE International conference on acoustics, speech and signal processing (ICASSP)","first-page":"5698","article-title":"Overview of the EVS codec architecture","author":"Dietz","year":"2015"},{"key":"10.1016\/j.neunet.2026.109128_bib0015","series-title":"2024\u202fIEEE Spoken language technology workshop (SLT)","first-page":"885","article-title":"Emilia: An extensive, multilingual, and diverse speech dataset for large-scale speech generation","author":"He","year":"2024"},{"key":"10.1016\/j.neunet.2026.109128_bib0016","doi-asserted-by":"crossref","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","article-title":"HuBERT: Self-supervised speech representation learning by masked prediction of hidden units","volume":"29","author":"Hsu","year":"2021","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"issue":"9","key":"10.1016\/j.neunet.2026.109128_sbref0017","doi-asserted-by":"crossref","first-page":"7926","DOI":"10.1609\/aaai.v35i9.16967","article-title":"Ib-GAN: Disentangled representation learning with information bottleneck generative adversarial networks","volume":"35","author":"Jeon","year":"2021","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"10.1016\/j.neunet.2026.109128_bib0018","unstructured":"Ji, S., Jiang, Z., Wang, W., Chen, Y., Fang, M., Zuo, J., Yang, Q., Cheng, X., Wang, Z., Li, R. et al. (2024). Wavtokenizer: An efficient acoustic discrete codec tokenizer for audio language modeling. arXiv preprint arXiv: 2408.16532."},{"key":"10.1016\/j.neunet.2026.109128_bib0019","series-title":"Icassp 2023 - 2023 ieee international conference on acoustics, speech and signal processing (icassp)","first-page":"1","article-title":"Disentangled feature learning for real-time neural speech coding","author":"Jiang","year":"2023"},{"key":"10.1016\/j.neunet.2026.109128_sbref0020","series-title":"Proceedings of the 41st international conference on machine learning","first-page":"22605","article-title":"Naturalspeech 3: Zero-shot speech synthesis with factorized codec and diffusion models","volume":"vol. 235","author":"Ju","year":"2024"},{"key":"10.1016\/j.neunet.2026.109128_bib0021","series-title":"Icassp 2020 - 2020 ieee international conference on acoustics, speech and signal processing (icassp)","first-page":"7669","article-title":"Libri-light: A benchmark for ASR with limited or no supervision","author":"Kahn","year":"2020"},{"key":"10.1016\/j.neunet.2026.109128_bib0022","unstructured":"Kingma, D. P., & Welling, M. (2022). Auto-encoding variational bayes. https:\/\/arxiv.org\/abs\/1312.6114."},{"issue":"7","key":"10.1016\/j.neunet.2026.109128_bib0023","doi-asserted-by":"crossref","first-page":"1324","DOI":"10.3390\/app9071324","article-title":"Joint detection and classification of singing voice melody using convolutional recurrent neural networks","volume":"9","author":"Kum","year":"2019","journal-title":"Applied Sciences"},{"key":"10.1016\/j.neunet.2026.109128_bib0024","series-title":"Advances in neural information processing systems","first-page":"27980","article-title":"High-fidelity audio compression with improved RVQGAN","volume":"vol. 36","author":"Kumar","year":"2023"},{"key":"10.1016\/j.neunet.2026.109128_sbref0025","series-title":"Proceedings of the 41st international conference on machine learning","first-page":"30479","article-title":"Smooth tchebycheff scalarization for multi-objective optimization","volume":"vol. 235","author":"Lin","year":"2024"},{"issue":"8","key":"10.1016\/j.neunet.2026.109128_bib0026","doi-asserted-by":"crossref","first-page":"10407","DOI":"10.1109\/TNNLS.2023.3241791","article-title":"Stdnet: Rethinking disentanglement learning with information theory","volume":"35","author":"Liu","year":"2024","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"10.1016\/j.neunet.2026.109128_sbref0027","series-title":"Proceedings of the 36th international conference on machine learning","first-page":"4114","article-title":"Challenging common assumptions in the unsupervised learning of disentangled representations","volume":"vol. 97","author":"Locatello","year":"2019"},{"key":"10.1016\/j.neunet.2026.109128_sbref0028","series-title":"Proceedings of the 38th international conference on machine learning","first-page":"7748","article-title":"Meta-stylespeech : Multi-speaker adaptive text-to-speech generation","volume":"vol. 139","author":"Min","year":"2021"},{"key":"10.1016\/j.neunet.2026.109128_bib0029","series-title":"2015\u202fIEEE International conference on acoustics, speech and signal processing (ICASSP)","first-page":"5206","article-title":"LibriSpeech: An ASR corpus based on public domain audio books","author":"Panayotov","year":"2015"},{"key":"10.1016\/j.neunet.2026.109128_bib0030","series-title":"Acl","article-title":"VOICECRAFT: Zero-shot speech editing and text-to-speech in the wild","author":"Peng","year":"2024"},{"key":"10.1016\/j.neunet.2026.109128_bib0031","doi-asserted-by":"crossref","unstructured":"Polyak, A., Adi, Y., Copet, J., Kharitonov, E., Lakhotia, K., Hsu, W.-N., Mohamed, A., & Dupoux, E. (2021). Speech resynthesis from discrete disentangled self-supervised representations. arXiv preprint arXiv: 2104.00355.","DOI":"10.21437\/Interspeech.2021-475"},{"key":"10.1016\/j.neunet.2026.109128_bib0032","unstructured":"Qin, Z., Zhao, W., Yu, X., & Sun, X. (2024). OpenVoice: Versatile instant voice cloning. https:\/\/arxiv.org\/abs\/2312.01479."},{"key":"10.1016\/j.neunet.2026.109128_bib0033","series-title":"Interspeech 2022","first-page":"4521","article-title":"Utmos: Utokyo-sarulab system for voicemos challenge 2022","author":"Saeki","year":"2022"},{"key":"10.1016\/j.neunet.2026.109128_bib0034","doi-asserted-by":"crossref","unstructured":"Schneider, S., Baevski, A., Collobert, R., & Auli, M. (2019). Wav2vec: Unsupervised pre-training for speech recognition. arXiv preprint arXiv: 1904.05862.","DOI":"10.21437\/Interspeech.2019-1873"},{"issue":"12","key":"10.1016\/j.neunet.2026.109128_bib0035","doi-asserted-by":"crossref","first-page":"9677","DOI":"10.1109\/TPAMI.2024.3420937","article-title":"Disentangled representation learning","volume":"46","author":"Wang","year":"2024","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.neunet.2026.109128_bib0036","series-title":"The thirteenth international conference on learning representations","article-title":"MaskGCT: Zero-shot text-to-speech with masked generative codec transformer","author":"Wang","year":"2025"},{"key":"10.1016\/j.neunet.2026.109128_bib0037","doi-asserted-by":"crossref","unstructured":"Xu, Q., Baevski, A., & Auli, M. (2021). Simple and effective zero-shot cross-lingual phoneme recognition. arXiv preprint arXiv: 2109.11680.","DOI":"10.21437\/Interspeech.2022-60"},{"key":"10.1016\/j.neunet.2026.109128_bib0038","unstructured":"Yang, D., Liu, S., Huang, R., Tian, J., Weng, C., & Zou, Y. (2023). Hifi-codec: Group-residual vector quantization for high fidelity audio codec. arXiv preprint arXiv: 2305.02765."},{"key":"10.1016\/j.neunet.2026.109128_bib0039","series-title":"The twelfth international conference on learning representations","article-title":"SpeechTokenizer: Unified speech tokenizer for speech language models","author":"Zhang","year":"2024"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026005897?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026005897?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T23:00:16Z","timestamp":1779836416000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026005897"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":39,"alternative-id":["S0893608026005897"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109128","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"OACodec: Audio attribute disentanglement via orthogonal disentanglement and mutual information minimization","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109128","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"109128"}}