{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T16:49:29Z","timestamp":1765039769846,"version":"3.28.0"},"reference-count":47,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,9,30]]},"DOI":"10.1109\/is262782.2024.10704173","type":"proceedings-article","created":{"date-parts":[[2024,10,7]],"date-time":"2024-10-07T17:42:20Z","timestamp":1728322940000},"page":"1-10","source":"Crossref","is-referenced-by-count":2,"title":["TinyVocos: Neural Vocoders on MCUs"],"prefix":"10.1109","author":[{"given":"Stefano","family":"Ciapponi","sequence":"first","affiliation":[{"name":"Energy Efficient Embedded Digital Architectures, Fondazione Bruno Kessler"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Francesco","family":"Paissan","sequence":"additional","affiliation":[{"name":"Energy Efficient Embedded Digital Architectures, Fondazione Bruno Kessler"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alberto","family":"Ancilotto","sequence":"additional","affiliation":[{"name":"Energy Efficient Embedded Digital Architectures, Fondazione Bruno Kessler"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Elisabetta","family":"Farella","sequence":"additional","affiliation":[{"name":"Energy Efficient Embedded Digital Architectures, Fondazione Bruno Kessler"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"issue":"13","key":"ref1","doi-asserted-by":"crossref","first-page":"11264","DOI":"10.1109\/JIOT.2023.3253602","article-title":"The Internet of Sounds: Convergent Trends, Insights, and Future Directions","volume":"10","author":"Turchet","year":"2023","journal-title":"IEEE Internet of Things Journal"},{"issue":"2","key":"ref2","first-page":"30","article-title":"Energy-Efficient Audio Processing at the Edge for Biologging Applications","volume-title":"Journal of Low Power Electronics and Applications","volume":"13","author":"Miquel","year":"2023"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3446393"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3370748.3406588"},{"issue":"4","key":"ref5","first-page":"654","article-title":"Compact Recurrent Neural Networks for Acoustic Event Detection on Low-Energy Low-Complexity Platforms","volume-title":"IEEE Journal of Selected Topics in Signal Processing","volume":"14","author":"Cerutti","year":"2020"},{"volume-title":"Low-complexity acoustic scene classification in DCASE 2022 Challenge","year":"2022","author":"Mart\u00edn-Morat\u00f3","key":"ref6"},{"key":"ref7","first-page":"177","article-title":"On the challenges of embedded real-time music information retrieval","volume-title":"Proceedings of the International Conference on Digital Audio Effects (DAFx)","volume":"3","author":"Stefani"},{"key":"ref8","first-page":"1","article-title":"Ximswap: Many-to-many face swapping for tinyml","volume-title":"ACM Transactions on Embedded Computing Systems","volume":"23","author":"Ancilotto","year":"2023"},{"key":"ref9","article-title":"Graph-based generative face anonymisation with pose preservation","volume-title":"ArXiv","volume":"abs\/2112.05496","author":"DallAsen","year":"2021"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.54489\/ijcim.v1i1.34"},{"key":"ref11","first-page":"677","article-title":"Phinet-gan: Bringing real-time face swapping to embedded devices","volume-title":"2023 IEEE International Conference on Pervasive Computing and Communications Workshops and other Affiliated Events (PerCom Workshops)","author":"Ancilotto"},{"key":"ref12","article-title":"Audioldm: Text-to-audio generation with latent diffusion models","volume-title":"International Conference on Machine Learning","author":"Liu","year":"2023"},{"key":"ref13","first-page":"2871","article-title":"Audioldm 2: Learning holistic audio generation with self-supervised pretraining","volume-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing","volume":"32","author":"Liu","year":"2023"},{"key":"ref14","first-page":"30 923","article-title":"Tango: Text-driven photorealistic and robust 3d stylization via lighting decomposition","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Chen","year":"2022"},{"article-title":"Make-an-audio: text-to-audio generation with prompt-enhanced diffusion models","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Huang","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461671"},{"key":"ref17","first-page":"4006","article-title":"Tacotron: Towards end-to-end speech synthesis","volume-title":"INTERSPEECH","author":"Wang","year":"2017"},{"key":"ref18","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2024-636","article-title":"Audio editing with non-rigid text prompts","volume-title":"Interspeech","author":"Paissan","year":"2024"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2013.6701851"},{"key":"ref20","article-title":"Hifi-gan: generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"Kong","year":"2020"},{"key":"ref21","article-title":"Hifi-gan: High-fidelity denoising and dereverberation based on speech deep features in adversarial networks","volume-title":"ArXiv","volume":"abs\/2006.05694","author":"Su","year":"2020"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688194"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2022-10294"},{"key":"ref24","doi-asserted-by":"crossref","first-page":"225","DOI":"10.21437\/Interspeech.2020-2172","article-title":"Efficient WaveGlow: An Improved WaveGlow Vocoder with Enhanced Speed","volume-title":"Interspeech 2020. ISCA","author":"Song","year":"2020"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.01556"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3510832"},{"key":"ref27","article-title":"Wavenet: A generative model for raw audio","volume-title":"ArXiv","volume":"abs\/1609.03499","author":"van den Oord","year":"2016"},{"key":"ref28","article-title":"SampleRNN: An unconditional end-to-end neural audio generation model","volume-title":"International Conference on Learning Representations","author":"Mehri","year":"2017"},{"key":"ref29","first-page":"3617","article-title":"Waveglow: A flow-based generative network for speech synthesis","volume-title":"ICASSP 2019 \u2013 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Prenger","year":"2018"},{"key":"ref30","article-title":"Improved variational inference with inverse autoregressive flow","volume-title":"Advances in Neural Information Processing Systems","volume":"29","author":"Kingma","year":"2016"},{"key":"ref31","article-title":"Melgan: Generative adversarial networks for conditional waveform synthesis","volume-title":"Neural Information Processing Systems","author":"Kumar","year":"2019"},{"key":"ref32","doi-asserted-by":"crossref","first-page":"1883","DOI":"10.1109\/WACV45572.2020.9093264","article-title":"Multi receptive field network for semantic segmentation","volume-title":"2020 IEEE Winter Conference on Applications of Computer Vision (WACV)","author":"Yuan","year":"2020"},{"key":"ref33","doi-asserted-by":"crossref","first-page":"2222","DOI":"10.21437\/Interspeech.2021-2173","article-title":"Basis-MelGAN: Efficient Neural Vocoder Based on Audio Decomposition","volume-title":"Interspeech 2021. ISCA","author":"Liu","year":"2021"},{"key":"ref34","doi-asserted-by":"crossref","first-page":"696","DOI":"10.1109\/ICASSP.2018.8462116","article-title":"TaSNet: Time-Domain Audio Separation Network for Real-Time, Single-Channel Speech Separation","volume-title":"2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Luo","year":"2018"},{"key":"ref35","doi-asserted-by":"crossref","first-page":"6034","DOI":"10.1109\/ICASSP39728.2021.9413605","article-title":"StyleMelGAN: An Efficient High-Fidelity Adversarial Vocoder with Temporal Adaptive Normalization","volume-title":"ICASSP 2021 \u2013 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Mustafa","year":"2021"},{"key":"ref36","doi-asserted-by":"crossref","first-page":"2251","DOI":"10.1109\/ICASSP.2018.8462431","article-title":"Fftnet: A real-time speaker-dependent neural vocoder","volume-title":"2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Jin","year":"2018"},{"key":"ref37","article-title":"Squeezewave: Extremely lightweight vocoders for on-device speech synthesis","volume-title":"ArXiv","volume":"abs\/2001.05685","author":"Zhai","year":"2020"},{"article-title":"Vocos: Closing the gap between time-domain and fourier-based neural vocoders for high-quality audio synthesis","volume-title":"arXiv preprint","author":"Siuzdak","key":"ref38"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"volume-title":"The lj speech dataset","year":"2017","author":"Ito","key":"ref41"},{"issue":"10","key":"ref42","article-title":"Perceptual evaluation of speech quality (pesq) - the new itu standard for end-to-end speech quality assessment - part ii - psychoacoustic model","volume":"50","author":"Beerends","year":"2002","journal-title":"Journal of the Audio Engineering Society. Audio Engineering Society"},{"key":"ref43","first-page":"4521","article-title":"UTMOS: utokyo-sarulab system for voicemos challenge 2022","volume-title":"Interspeech 2022, 23rd Annual Conference of the International Speech Communication Association, Incheon, Korea, 18\u201322 September 2022","author":"Saeki","year":"2022"},{"key":"ref44","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref45","article-title":"Cross-domain neural pitch and periodicity estimation","volume-title":"ArXiv","volume":"abs\/2301.12258","author":"Morrison","year":"2023"},{"key":"ref46","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/QoMEX48832.2020.9123150","article-title":"Visqol v3: An open source production ready objective speech and audio metric","volume-title":"2020 Twelfth International Conference on Quality of Multimedia Experience (QoMEX)","author":"Chinen","year":"2020"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.5334\/jors.187"}],"event":{"name":"2024 IEEE 5th International Symposium on the Internet of Sounds (IS2)","start":{"date-parts":[[2024,9,30]]},"location":"Erlangen, Germany","end":{"date-parts":[[2024,10,2]]}},"container-title":["2024 IEEE 5th International Symposium on the Internet of Sounds (IS2)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10704037\/10704076\/10704173.pdf?arnumber=10704173","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,8]],"date-time":"2024-10-08T17:34:16Z","timestamp":1728408856000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10704173\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"references-count":47,"URL":"https:\/\/doi.org\/10.1109\/is262782.2024.10704173","relation":{},"subject":[],"published":{"date-parts":[[2024,9,30]]}}}