{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T22:41:22Z","timestamp":1765233682556},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,12,12]],"date-time":"2023-12-12T00:00:00Z","timestamp":1702339200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,12,12]],"date-time":"2023-12-12T00:00:00Z","timestamp":1702339200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Intell Inf Syst"],"published-print":{"date-parts":[[2024,8]]},"DOI":"10.1007\/s10844-023-00833-w","type":"journal-article","created":{"date-parts":[[2023,12,12]],"date-time":"2023-12-12T09:14:42Z","timestamp":1702372482000},"page":"1071-1085","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Audio super-resolution via vision transformer"],"prefix":"10.1007","volume":"62","author":[{"given":"Simona","family":"Nistic\u00f2","sequence":"first","affiliation":[]},{"given":"Luigi","family":"Palopoli","sequence":"additional","affiliation":[]},{"given":"Adele Pia","family":"Romano","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,12,12]]},"reference":[{"key":"833_CR1","doi-asserted-by":"crossref","unstructured":"Andreev, P., Alanov, A., Ivanov, O., & Vetrov, D. (2022). Hifi++: A unified framework for neural vocoding, bandwidth extension and speech enhancement. Preprint retrieved from http:\/\/arxiv.org\/abs\/2203.13086","DOI":"10.1109\/ICASSP49357.2023.10097255"},{"issue":"4","key":"833_CR2","doi-asserted-by":"publisher","first-page":"421","DOI":"10.1109\/10.486262","volume":"43","author":"S Charleston","year":"1996","unstructured":"Charleston, S., & Azimi-Sadjadi, M. R. (1996). Reduced order Kalman filtering for the enhancement of respiratory sounds. IEEE Transactions on Biomedical Engineering, 43(4), 421\u2013424.","journal-title":"IEEE Transactions on Biomedical Engineering"},{"key":"833_CR3","doi-asserted-by":"crossref","unstructured":"Chen, X., & Yang, J. (2021). Speech bandwidth extension based on Wasserstein generative adversarial network. In: 2021 IEEE 21st International Conference on Communication Technology (ICCT) (pp. 1356\u20131362). IEEE.","DOI":"10.1109\/ICCT52962.2021.9658055"},{"key":"833_CR4","unstructured":"Choi, H.-S., Kim, J.-H., Huh, J., Kim, A., Ha, J.-W., & Lee, K. (2018). Phase-aware speech enhancement with deep complex u-net. In International Conference on Learning Representations."},{"key":"833_CR5","doi-asserted-by":"crossref","unstructured":"Dai, J., Zhang, Y., Xie, P., & Xu, X. (2021). Super-resolution for music signals using generative adversarial networks. In 2021 IEEE 4th International Conference on Big Data and Artificial Intelligence (BDAI) (pp. 1\u20135). IEEE.","DOI":"10.1109\/BDAI52447.2021.9515219"},{"key":"833_CR6","unstructured":"Defferrard, M., Benzi, K., Vandergheynst, P., & Bresson, X. (2016) FMA: A dataset for music analysis. Preprint retrieved from arXiv:1612.01840"},{"issue":"4","key":"833_CR7","doi-asserted-by":"publisher","first-page":"1095","DOI":"10.1007\/s00521-019-04158-0","volume":"32","author":"J Deng","year":"2020","unstructured":"Deng, J., Schuller, B., Eyben, F., Schuller, D., Zhang, Z., Francois, H., & Oh, E. (2020). Exploiting time-frequency patterns with LSTM-RNNS for low-bitrate audio restoration. Neural Computing and Applications, 32(4), 1095\u20131107.","journal-title":"Neural Computing and Applications"},{"key":"833_CR8","doi-asserted-by":"crossref","unstructured":"Erell, A., & Weintraub, M. (1990).Estimation using log-spectral-distance criterion for noise-robust speech recognition. In International Conference on Acoustics, Speech, and Signal Processing (pp. 853\u2013856). IEEE.","DOI":"10.1109\/ICASSP.1991.150487"},{"key":"833_CR9","doi-asserted-by":"publisher","first-page":"108772","DOI":"10.1016\/j.apacoust.2022.108772","volume":"194","author":"T Fujimura","year":"2022","unstructured":"Fujimura, T., & Miyazaki, R. (2022). Removal of musical noise using deep speech prior. Applied Acoustics, 194, 108772.","journal-title":"Applied Acoustics"},{"key":"833_CR10","doi-asserted-by":"crossref","unstructured":"Gong, Y., Chung, Y.-A., & Glass, J. (2021). AST: Audio Spectrogram Transformer.","DOI":"10.21437\/Interspeech.2021-698"},{"key":"833_CR11","unstructured":"Guo, M.-H., Xu, T.-X., Liu, J.-J., Liu, Z.-N., Jiang, P.-T., Mu, T.-J., Zhang, S.-H., Martin, R. R., Cheng, M.-M., & Hu, S.-M. (2021). Attention mechanisms in computer vision: A survey. Preprint retrieved from http:\/\/arxiv.org\/abs\/2111.07624"},{"key":"833_CR12","unstructured":"https:\/\/huggingface.co\/docs\/transformers\/index"},{"key":"833_CR13","unstructured":"Huang, C.-Z. A., Vaswani, A., Uszkoreit, J., Shazeer, N., Simon, I., Hawthorne, C., Dai, A. M., Hoffman, M. D., Dinculescu, M., & Eck, D. (2018). Music transformer. Preprint retrieved from http:\/\/arxiv.org\/abs\/1809.04281"},{"issue":"12","key":"833_CR14","doi-asserted-by":"publisher","first-page":"2088","DOI":"10.4249\/scholarpedia.2088","volume":"1","author":"DH Johnson","year":"2006","unstructured":"Johnson, D. H. (2006). Signal-to-noise ratio. Scholarpedia, 1(12), 2088.","journal-title":"Scholarpedia"},{"key":"833_CR15","doi-asserted-by":"crossref","unstructured":"Kim, J., Englebienne, G., Truong, K. P., & Evers, V. (2017). Deep temporal models using identity skip-connections for speech emotion recognition. In E. A. Laurent\u00a0Amsaleg & B. Huet (Eds.) Proceedings of the 25th ACM International Conference on Multimedia (pp. 1006\u20131013).","DOI":"10.1145\/3123266.3123353"},{"key":"833_CR16","unstructured":"Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. Preprint retrieved from http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"833_CR17","unstructured":"Kolesnikov, A., Dosovitskiy, A., Weissenborn, D., Heigold, G., Uszkoreit, J., Beyer, L., Minderer, M., Dehghani, M., Houlsby, N., Gelly, S., et al. (2021). An image is worth 16x16 words: Transformers for image recognition at scale."},{"key":"833_CR18","first-page":"17022","volume":"33","author":"J Kong","year":"2020","unstructured":"Kong, J., Kim, J., & Bae, J. (2020). Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in Neural Information Processing Systems, 33, 17022\u201317033.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"833_CR19","unstructured":"Kuleshov, V., Enam, S. Z., & Ermon, S. (2017). Audio super resolution using neural networks. Preprint retrieved from http:\/\/arxiv.org\/abs\/1708.00853"},{"key":"833_CR20","doi-asserted-by":"crossref","unstructured":"Li, K., & Lee, C.-H. (2015). A deep neural network approach to speech bandwidth expansion. In 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 4395\u20134399). IEEE.","DOI":"10.1109\/ICASSP.2015.7178801"},{"key":"833_CR21","doi-asserted-by":"crossref","unstructured":"Liu, Y. (2021). Recovery of lossy compressed music based on CNN super-resolution and GAN. In 2021 IEEE 3rd International Conference on Frontiers Technology of Information and Computer (ICFTIC) (pp. 623\u2013629). IEEE.","DOI":"10.1109\/ICFTIC54370.2021.9647041"},{"issue":"18","key":"833_CR22","doi-asserted-by":"publisher","first-page":"28365","DOI":"10.1007\/s11042-021-11080-y","volume":"80","author":"S Liu","year":"2021","unstructured":"Liu, S., Keren, G., Parada-Cabaleiro, E., & Schuller, B. (2021). N-HANS: A neural network-based toolkit for in-the-wild audio enhancement. Multimedia Tools and Applications, 80(18), 28365\u201328389.","journal-title":"Multimedia Tools and Applications"},{"key":"833_CR23","unstructured":"Loshchilov, I., & Hutter, F. (2017). Decoupled weight decay regularization. Preprint retrieved from http:\/\/arxiv.org\/abs\/1711.05101"},{"key":"833_CR24","doi-asserted-by":"crossref","unstructured":"Mandel, M., Tal, O., & Adi, Y. (2023). Aero: Audio super resolution in the spectral domain. In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 1\u20135). IEEE.","DOI":"10.1109\/ICASSP49357.2023.10095382"},{"key":"833_CR25","doi-asserted-by":"crossref","unstructured":"McFee, B., Raffel, C., Liang, D., Ellis, D. P., McVicar, M., Battenberg, E., & Nieto, O. (2015). librosa: Audio and music signal analysis in python. In Kathryn\u00a0Huff, J. B. (Ed.), Proceedings of the 14th Python in Science Conference, (Vol. 8, pp. 18\u201325). Citeseer.","DOI":"10.25080\/Majora-7b98e3ed-003"},{"issue":"1","key":"833_CR26","first-page":"1049","volume":"45","author":"S McKinley","year":"1998","unstructured":"McKinley, S., & Levine, M. (1998). Cubic spline interpolation. College of the Redwoods, 45(1), 1049\u20131060.","journal-title":"College of the Redwoods"},{"key":"833_CR27","doi-asserted-by":"crossref","unstructured":"Nistic\u00f2, S., Palopoli, L., & Romano, A. P. (2022). Audio super-resolution via vision transformer. In International Symposium on Methodologies for Intelligent Systems (pp. 378\u2013387). Springer.","DOI":"10.1007\/978-3-031-16564-1_36"},{"key":"833_CR28","doi-asserted-by":"crossref","unstructured":"Nogales, A., Donaher, S., & Garc\u00eda-Tejedor, \u00c1. (2023). A deep learning framework for audio restoration using convolutional\/deconvolutional deep autoencoders. Expert Systems with Applications, 120586.","DOI":"10.1016\/j.eswa.2023.120586"},{"key":"833_CR29","doi-asserted-by":"crossref","unstructured":"Oyedotun, O. K., Al\u00a0Ismaeil, K., & Aouada, D. (2022). Why is everyone training very deep neural network with skip connections? IEEE Transactions on Neural Networks and Learning Systems.","DOI":"10.1109\/ICPR48806.2021.9412508"},{"key":"833_CR30","doi-asserted-by":"crossref","unstructured":"Podder, P., Khan, T. Z., Khan, M. H., & Rahman, M. M. (2014). Comparative performance analysis of hamming, hanning and blackman window. International Journal of Computer Applications,96(18).","DOI":"10.5120\/16891-6927"},{"issue":"2","key":"833_CR31","doi-asserted-by":"publisher","first-page":"45","DOI":"10.5815\/ijisa.2016.02.06","volume":"8","author":"N Prasad","year":"2016","unstructured":"Prasad, N., & Kumar, T. K. (2016). Bandwidth extension of speech signals: A comprehensive review. International Journal of Intelligent Systems and Applications, 8(2), 45\u201352.","journal-title":"International Journal of Intelligent Systems and Applications"},{"key":"833_CR32","doi-asserted-by":"crossref","unstructured":"Rethage, D., Pons, J., & Serra, X. (2018). A wavenet for speech denoising. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 5069\u20135073). IEEE.","DOI":"10.1109\/ICASSP.2018.8462417"},{"issue":"1","key":"833_CR33","doi-asserted-by":"publisher","first-page":"10","DOI":"10.1109\/jrproc.1949.232969","volume":"37","author":"CE Shannon","year":"1949","unstructured":"Shannon, C. E. (1949). Communication in the presence of noise. Proceedings of the IRE, 37(1), 10\u201321. https:\/\/doi.org\/10.1109\/jrproc.1949.232969","journal-title":"Proceedings of the IRE"},{"key":"833_CR34","doi-asserted-by":"crossref","unstructured":"Smaragdis, P., & Raj, B. (2007). Example-driven bandwidth expansion. In 2007 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 135\u2013138). IEEE.","DOI":"10.1109\/ASPAA.2007.4393004"},{"key":"833_CR35","doi-asserted-by":"crossref","unstructured":"Su, J., Wang, Y., Finkelstein, A., & Jin, Z. (2021). Bandwidth extension is all you need. In ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 696\u2013700). IEEE.","DOI":"10.1109\/ICASSP39728.2021.9413575"},{"key":"833_CR36","doi-asserted-by":"crossref","unstructured":"Su, J., Wang, Y., Finkelstein, A., & Jin, Z. (2021). Bandwidth extension is all you need. In ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 696\u2013700). IEEE.","DOI":"10.1109\/ICASSP39728.2021.9413575"},{"key":"833_CR37","doi-asserted-by":"crossref","unstructured":"Wang, H., & Wang, D. (2020). Time-frequency loss for CNN based speech super-resolution. In ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 861\u2013865). IEEE.","DOI":"10.1109\/ICASSP40776.2020.9053712"},{"key":"833_CR38","doi-asserted-by":"crossref","unstructured":"Wang, H., & Wang, D. (2021). Towards robust speech super-resolution. IEEE\/ACM Transactions on Audio, Speech, and Language Processing,29, 2058\u20132066.","DOI":"10.1109\/TASLP.2021.3054302"},{"issue":"10","key":"833_CR39","doi-asserted-by":"publisher","first-page":"1702","DOI":"10.1109\/TASLP.2018.2842159","volume":"26","author":"D Wang","year":"2018","unstructured":"Wang, D., & Chen, J. (2018). Supervised speech separation based on deep learning: An overview. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 26(10), 1702\u20131726.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"issue":"1","key":"833_CR40","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1109\/TASE.2007.911680","volume":"5","author":"J-C Wang","year":"2008","unstructured":"Wang, J.-C., Lee, H.-P., Wang, J.-F., & Lin, C.-B. (2008). Robust environmental sound recognition for home automation. IEEE Transactions on Automation Science and Engineering, 5(1), 25\u201331.","journal-title":"IEEE Transactions on Automation Science and Engineering"},{"key":"833_CR41","doi-asserted-by":"crossref","unstructured":"Westhausen, N. L., & Meyer, B. T. (2020). Dual-signal transformation LSTM network for real-time noise suppression.","DOI":"10.21437\/Interspeech.2020-2631"},{"key":"833_CR42","doi-asserted-by":"crossref","unstructured":"Yamamoto, R., Song, E., & Kim, J.-M. (2020) Parallel wavegan: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram. In ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 6199\u20136203). IEEE.","DOI":"10.1109\/ICASSP40776.2020.9053795"}],"container-title":["Journal of Intelligent Information Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10844-023-00833-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10844-023-00833-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10844-023-00833-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,5]],"date-time":"2024-09-05T08:10:44Z","timestamp":1725523844000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10844-023-00833-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,12]]},"references-count":42,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,8]]}},"alternative-id":["833"],"URL":"https:\/\/doi.org\/10.1007\/s10844-023-00833-w","relation":{},"ISSN":["0925-9902","1573-7675"],"issn-type":[{"type":"print","value":"0925-9902"},{"type":"electronic","value":"1573-7675"}],"subject":[],"published":{"date-parts":[[2023,12,12]]},"assertion":[{"value":"13 July 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 November 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 November 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 December 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not Applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval"}},{"value":"Not Applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent to participate"}},{"value":"Not Applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"The authors declare no competing interests.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}]}}