{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,29]],"date-time":"2025-11-29T06:51:21Z","timestamp":1764399081797,"version":"3.46.0"},"reference-count":30,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,22]]},"DOI":"10.1109\/apsipaasc65261.2025.11249045","type":"proceedings-article","created":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T18:40:26Z","timestamp":1764355226000},"page":"897-902","source":"Crossref","is-referenced-by-count":0,"title":["BAANI: A 296M-Parameter Neural Vocoder for End-To-End Punjabi Speech Synthesis"],"prefix":"10.1109","author":[{"given":"Siddharth","family":"Kumar","sequence":"first","affiliation":[{"name":"Dhirubhai Ambani University (formerly DA-IICT),Speech Research Lab,Gandhinagar,GJ,India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nisarg","family":"Trivedi","sequence":"additional","affiliation":[{"name":"Dhirubhai Ambani University (formerly DA-IICT),Speech Research Lab,Gandhinagar,GJ,India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ravindrakumar M.","family":"Purohit","sequence":"additional","affiliation":[{"name":"Dhirubhai Ambani University (formerly DA-IICT),Speech Research Lab,Gandhinagar,GJ,India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hemant A.","family":"Patil","sequence":"additional","affiliation":[{"name":"Dhirubhai Ambani University (formerly DA-IICT),Speech Research Lab,Gandhinagar,GJ,India"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Wavenet: A generative model for raw audio","author":"Oord","year":"2016","journal-title":"arXiv preprint"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref3","first-page":"3171","article-title":"Fastspeech: Fast, robust and controllable text to speech","volume-title":"Advances in Neural Information Processing Systems (NIPS)","volume":"32","author":"Ren","year":"2019"},{"key":"ref4","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"International Conference on Machine Learning","author":"Kim","year":"2021"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref6","article-title":"Diffwave: A versatile diffusion model for audio synthesis","volume-title":"arXiv preprint","author":"Kong","year":"2020"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/577"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref9","article-title":"Hifi-gan: Generative adversarial network for efficient and high fidelity speech synthesis","volume-title":"in Advances in Neural Information Processing Systems (NeurIPS)","author":"Kong","year":"2020"},{"key":"ref10","article-title":"Generative adversarial nets","volume-title":"in Advances in Neural Information Processing Systems","volume":"27","author":"Goodfellow","year":"2014"},{"key":"ref11","article-title":"BigVgan: A universal neural vocoder with large-scale training","author":"Lee","year":"2022","journal-title":"arXiv preprint"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1238"},{"key":"ref13","article-title":"Universal melgan: A robust neural vocoder for high-fidelity waveform generation in multiple domains","author":"Jang","year":"2020","journal-title":"arXiv preprint"},{"key":"ref14","article-title":"Improved techniques for training gans","volume-title":"in Advances in Neural Information Processing Systems","volume":"29","author":"Salimans","year":"2016"},{"volume-title":"Indic TTS: A text-to-speech database for indian languages","year":"2023","author":"Murthy","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1016"},{"key":"ref17","first-page":"2672","article-title":"MelGAN: Generative adversarial networks for conditional waveform synthesis","volume-title":"Advances in Neural Information Processing Systems (NIPS)","volume":"32","author":"Kumar","year":"2019"},{"article-title":"Bigvgan: A universal neural vocoder with large-scale training","volume-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Lee","key":"ref18"},{"volume-title":"Nvidia geforce gtx 1080 review","year":"2016","key":"ref19"},{"journal-title":"ITU-T, Tech. Rep.","article-title":"ITU-T Recommendation P.800: Methods for subjective determination of transmission quality","year":"1996","key":"ref20"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-299"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/PACRIM.1993.407206"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2114881"},{"key":"ref25","article-title":"A statistical analysis of modulation spectra for reverberant speech recognition","author":"Komatsu","year":"2016","journal-title":"in Interspeech"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.4324\/9780203771587"},{"issue":"4","key":"ref27","first-page":"35","article-title":"Modern information retrieval: A brief overview","volume":"24","author":"Singhal","year":"2001","journal-title":"IEEE Data Engineering Bulletin"},{"volume-title":"Robust speech recognition via large-scale weak supervision","year":"2023","author":"Radford","key":"ref28"},{"key":"ref29","first-page":"248","article-title":"Words Worth: How robust automatic speech recognition is to speech variations","volume-title":"in IEEE Workshop on Spoken Language Technology (SLT)","author":"Goldwater","year":"2010"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"}],"event":{"name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","start":{"date-parts":[[2025,10,22]]},"location":"Singapore, Singapore","end":{"date-parts":[[2025,10,24]]}},"container-title":["2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11248853\/11248968\/11249045.pdf?arnumber=11249045","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,29]],"date-time":"2025-11-29T06:50:07Z","timestamp":1764399007000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11249045\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,22]]},"references-count":30,"URL":"https:\/\/doi.org\/10.1109\/apsipaasc65261.2025.11249045","relation":{},"subject":[],"published":{"date-parts":[[2025,10,22]]}}}