{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T13:52:34Z","timestamp":1774965154673,"version":"3.50.1"},"reference-count":69,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"name":"Institute of Information &amp; communications Technology Planning &amp; Evaluation"},{"name":"Korea Government","award":["2019-0-00079"],"award-info":[{"award-number":["2019-0-00079"]}]},{"name":"Artificial Intelligence Graduate School Program","award":["2021-0-02068"],"award-info":[{"award-number":["2021-0-02068"]}]},{"name":"Artificial Intelligence Innovation Hub"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2023.3349053","type":"journal-article","created":{"date-parts":[[2024,1,5]],"date-time":"2024-01-05T19:50:41Z","timestamp":1704484241000},"page":"1012-1022","source":"Crossref","is-referenced-by-count":20,"title":["Audio Super-Resolution With Robust Speech Representation Learning of Masked Autoencoder"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2287-9111","authenticated-orcid":false,"given":"Seung-Bin","family":"Kim","sequence":"first","affiliation":[{"name":"Department of Artificial Intelligence, Korea University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8925-4474","authenticated-orcid":false,"given":"Sang-Hoon","family":"Lee","sequence":"additional","affiliation":[{"name":"Department of Artificial Intelligence, Korea University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2390-7628","authenticated-orcid":false,"given":"Ha-Yeong","family":"Choi","sequence":"additional","affiliation":[{"name":"Department of Artificial Intelligence, Korea University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6249-4996","authenticated-orcid":false,"given":"Seong-Whan","family":"Lee","sequence":"additional","affiliation":[{"name":"Department of Artificial Intelligence, Korea University, Seoul, South Korea"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.885934"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2118206"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2015.2470560"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2519146"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-36"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11026"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-45"},{"key":"ref8","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy","year":"2021"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref10","first-page":"28708","article-title":"Masked autoencoders that listen","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","volume":"35","author":"Huang","year":"2022"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10961"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-555"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2018.2798811"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3054302"},{"key":"ref15","article-title":"Audio super-resolution using neural networks","volume-title":"Proc. 5th Int. Conf. Learn. Representations","author":"Kuleshov","year":"2017"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462049"},{"key":"ref17","article-title":"Temporal FiLM: Capturing long-range sequence dependencies with feature-wise modulations.","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Birnbaum","year":"2019"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413439"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1563"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747699"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-892"},{"key":"ref22","first-page":"125","article-title":"WaveNet: A generative model for raw audio","volume-title":"Proc. 9th ISCA Workshop Speech Synth. Workshop","author":"Oord","year":"2016"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413575"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref25","article-title":"TFGAN: Time and frequency domain based generative adversarial network for high-fidelity speech synthesis","author":"Tian","year":"2020"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11017"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462588"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682215"},{"key":"ref29","article-title":"Bandwidth extension on raw audio via generative adversarial networks","author":"Kim","year":"2019"},{"key":"ref30","article-title":"Nu-GAN: High resolution neural upsampling with GAN","author":"Kumar","year":"2020"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2605"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095382"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3190726"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-113"},{"key":"ref35","article-title":"Diffwave: A versatile diffusion model for audio synthesis","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kong","year":"2021"},{"key":"ref36","article-title":"WaveGrad: Estimating gradients for waveform generation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chen","year":"2021"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095103"},{"key":"ref38","article-title":"AudioSR: Versatile audio super-resolution at scale","author":"Liu","year":"2023"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1582"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2380"},{"key":"ref41","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","author":"Baevski","year":"2020"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3275033"},{"key":"ref44","first-page":"16624","article-title":"HierSpeech: Bridging the gap between text and speech by hierarchical variational inference using self-supervised representations for speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lee","year":"2022"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3162078"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3133189"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.235"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref49","article-title":"WaveNet: A generative model for raw audio","author":"Oord","year":"2016"},{"key":"ref50","first-page":"2410","article-title":"Efficient neural audio synthesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kalchbrenner","year":"2018"},{"key":"ref51","first-page":"3918","article-title":"Parallel wavenet: Fast high-fidelity speech synthesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Oord","year":"2018"},{"key":"ref52","article-title":"MelGAN: Generative adversarial networks for conditional waveform synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Kumar","year":"2019"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref54","first-page":"17022","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Kong","year":"2020"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1016"},{"key":"ref56","article-title":"High fidelity neural audio compression","author":"Dfossez","year":"2023","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-845"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746675"},{"key":"ref59","article-title":"BigVGAN: A universal neural vocoder with large-scale training","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"gil Lee","year":"2023"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref62","article-title":"CSTR VCTK Corpus: English multi-speaker corpus for CSTR voice cloning toolkit (version 0.92)","author":"Yamagishi","year":"2019"},{"key":"ref63","article-title":"Decoupled weight decay regularization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Loshchilov","year":"2019"},{"key":"ref64","article-title":"End-to-end adversarial text-to-speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Donahue","year":"2021"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/QoMEX48832.2020.9123150"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2012.291"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i14.17559"},{"key":"ref69","first-page":"8067","article-title":"Glow-TTS: A. 
generative flow for text-to-speech via monotonic alignment search","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Kim","year":"2020"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/10304349\/10381805.pdf?arnumber=10381805","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,18]],"date-time":"2025-04-18T17:38:30Z","timestamp":1744997910000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10381805\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":69,"URL":"https:\/\/doi.org\/10.1109\/taslp.2023.3349053","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}