{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:13:25Z","timestamp":1763190805450,"version":"3.45.0"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100006180","name":"Technology Development","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006180","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11227679","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["AMNet: An Acoustic Model Network for Enhanced Mandarin Speech Synthesis"],"prefix":"10.1109","author":[{"given":"Yubing","family":"Cao","sequence":"first","affiliation":[{"name":"Xinjiang University,Xinjiang Multimodal Intelligent Processing and Information Security Engineering Technology Research Center, School of Computer Science and Technology,China"}]},{"given":"Yinfeng","family":"Yu","sequence":"additional","affiliation":[{"name":"Xinjiang University,Xinjiang Multimodal Intelligent Processing and Information Security Engineering Technology Research Center, School of Computer Science and Technology,China"}]},{"given":"Yongming","family":"Li","sequence":"additional","affiliation":[{"name":"Xinjiang University,Xinjiang Multimodal Intelligent Processing and Information Security Engineering Technology Research Center, School of Computer Science and Technology,China"}]},{"given":"Liejun","family":"Wang","sequence":"additional","affiliation":[{"name":"Xinjiang University,Xinjiang Multimodal Intelligent Processing and Information Security Engineering Technology Research Center, School of Computer Science and Technology,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/tasl.2011.2134090"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854318"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178816"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1162\/neco_a_01579"},{"article-title":"Sound adversarial audio-visual navigation","volume-title":"Proc. The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event","author":"Yu","key":"ref5"},{"article-title":"Pay self-attention to audiovisual navigation","volume-title":"Proc. 33rd British Machine Vision Conference 2022, BMVC 2022","author":"Yu","key":"ref6"},{"key":"ref7","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","volume-title":"Proc. Advances in Neural Information Processing Systems (NeurIPS)","author":"Sutskever"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-134"},{"issue":"7","key":"ref9","first-page":"1756","article-title":"Char2wav: End-to-end speech synthesis","volume":"25","author":"Wang","year":"2017","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"article-title":"Deep voice 3: 2000-speaker neural text-to-speech","year":"2017","author":"Wang","key":"ref11"},{"issue":"3","key":"ref12","first-page":"567","article-title":"Learning internal representations by error propagation, parallel distributed processing, explorations in the microstructure of cognition","volume":"28","author":"Graves","year":"2018","journal-title":"Journal of Cognitive Neuroscience"},{"issue":"8","key":"ref13","first-page":"3054","article-title":"Fastspeech: Fast, robust and controllable text-to-speech","volume":"31","author":"Wang","year":"2020","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"issue":"5","key":"ref14","first-page":"1035","article-title":"Fastspeech 2: Fast and high-quality end-to-end text-to-speech","volume":"32","author":"Wang","year":"2021","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"issue":"3","key":"ref15","first-page":"341","article-title":"Modeling localness for self-attention networks","volume":"19","author":"Jain","year":"2019","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"issue":"9","key":"ref16","first-page":"2079","article-title":"On the localness modeling for the self-attention based end-to-end speech synthesis","volume":"52","author":"Yu","year":"2018","journal-title":"Speech Communication"},{"issue":"3","key":"ref17","first-page":"871","article-title":"SeDepTTS: Enhancing the naturalness via semantic dependency and local convolution for text-to-speech synthesis","volume":"32","author":"Li","year":"2021","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"issue":"6","key":"ref18","first-page":"902","article-title":"Audio-visual automatic speech recognition using PZM, MFCC and statistical analysis","volume":"19","author":"Xie","year":"2020","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"key":"ref19","first-page":"459","article-title":"Tone learning in low-resource bilingual TTS","volume":"34","author":"Zhang","year":"2019","journal-title":"Speech Communication"},{"key":"ref20","first-page":"542","article-title":"Improved syllable-based text-to-speech synthesis for tone language systems","volume":"29","author":"Li","year":"2020","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"key":"ref21","first-page":"88","article-title":"Probing the phonetic and phonological knowledge of tones in Mandarin TTS models","volume":"32","author":"Li","year":"2021","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"issue":"4","key":"ref22","first-page":"656","article-title":"Implementing prosodic phrasing in Chinese end-to-end speech synthesis","volume":"27","author":"Tan","year":"2019","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"key":"ref23","first-page":"1072","article-title":"Natural TTS synthesis by conditioning WaveNet on mel spectrogram predictions","volume":"25","author":"Zhang","year":"2018","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"article-title":"WaveNet: A generative model for raw audio","year":"2016","author":"Van Den Oord","key":"ref24"},{"key":"ref25","first-page":"144","article-title":"ToneNet: A CNN model of tone classification of Mandarin Chinese","volume":"28","author":"Liu","year":"2019","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"issue":"1","key":"ref26","first-page":"190","article-title":"End-to-end emotional speech synthesis using style tokens and semi-supervised training","volume":"32","author":"Yao","year":"2021","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"issue":"6","key":"ref27","first-page":"768","article-title":"Joint Chinese word segmentation and part-of-speech tagging via two-stage span labeling","volume":"38","author":"Chen","year":"2020","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"issue":"7","key":"ref28","first-page":"2031","article-title":"A unified sequence-to-sequence front-end model for Mandarin text-to-speech synthesis","volume":"28","author":"Wang","year":"2021","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"issue":"5","key":"ref29","first-page":"1225","article-title":"Knowledge-based linguistic encoding for end-to-end Mandarin text-to-speech synthesis","volume":"29","author":"Zhang","year":"2020","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"issue":"8","key":"ref30","first-page":"1017","article-title":"A tutorial on hidden Markov models and selected applications in speech recognition","volume":"31","author":"Brown","year":"2018","journal-title":"IEEE Trans. Speech Audio Process."},{"key":"ref31","first-page":"330","article-title":"The Viterbi algorithm","volume":"6","author":"Viterbi","year":"1967","journal-title":"IEEE Trans. Inform. Theory"},{"key":"ref32","first-page":"1059","article-title":"Language modeling with gated convolutional networks","volume":"48","author":"Goodfellow","year":"2019","journal-title":"Journal of Machine Learning Research"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref34","first-page":"3210","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"29","author":"Tan","year":"2020","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"issue":"3","key":"ref35","first-page":"655","article-title":"Improving Mandarin end-to-end speech synthesis by self-attention and learnable Gaussian bias","volume":"32","author":"Zhang","year":"2021","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"issue":"4","key":"ref36","first-page":"1345","article-title":"A kernelized Stein discrepancy for goodness-of-fit tests","volume":"48","author":"Munoz","year":"2019","journal-title":"Journal of Statistical Computation and Simulation"},{"key":"ref37","first-page":"845","article-title":"Yin, a fundamental frequency estimator for speech and music","volume":"28","author":"Wong","year":"2018","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11227679.pdf?arnumber=11227679","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:10:48Z","timestamp":1763190648000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11227679\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11227679","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}