{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:29:02Z","timestamp":1775068142051,"version":"3.50.1"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s10772-023-10071-8","type":"journal-article","created":{"date-parts":[[2023,12,13]],"date-time":"2023-12-13T03:02:14Z","timestamp":1702436534000},"page":"1017-1030","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":12,"title":["SENet-based speech emotion recognition using synthesis-style transfer data augmentation"],"prefix":"10.1007","volume":"26","author":[{"given":"Rajeev","family":"Rajan","sequence":"first","affiliation":[]},{"given":"T. V.","family":"Hridya Raj","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,12,13]]},"reference":[{"key":"10071_CR1","doi-asserted-by":"crossref","unstructured":"Ali-Gombe, A., & MFC-GAN EE. (2019). Class-imbalanced dataset classification using multiple fake class generative adversarial network. Neurocomputing, 361, 212\u2013221.","DOI":"10.1016\/j.neucom.2019.06.043"},{"key":"10071_CR2","doi-asserted-by":"crossref","unstructured":"Bao, F., Neumann, M., & Vu, T. (2019). 
Cyclegan-based emotion style transfer as data augmentation for speech emotion recognition. In InterSpeech (pp. 2828\u20132832).","DOI":"10.21437\/Interspeech.2019-2293"},{"key":"10071_CR3","doi-asserted-by":"crossref","unstructured":"Chatziagapi, A., Paraskevopoulos, G., Sgouropoulos, D., Pantazopoulos, G., Nikandrou, M., Giannakopoulos, T., Katsamanis, A., Potamianos, A., & Narayanan, S. (2019). Data augmentation using gans for speech emotion recognition. In Interspeech (pp. 171\u2013175).","DOI":"10.21437\/Interspeech.2019-2561"},{"key":"10071_CR4","doi-asserted-by":"publisher","unstructured":"Dey, S., Rajan, R., Padmanabhan, R., & Murthy, H. A. (2011). Feature diversity for emotion, language and speaker verification. In 2011 national conference on communications (NCC) (pp. 1\u20135). Bangalore, India. https:\/\/doi.org\/10.1109\/NCC.2011.5734774.","DOI":"10.1109\/NCC.2011.5734774"},{"key":"10071_CR5","unstructured":"Donahue, C., McAuley, J. J., & Puckette, M. (2019). Adversarial audio synthesis. In Proceedings of international conference on learning representations (ICLR) (pp. 1\u201316)."},{"key":"10071_CR6","doi-asserted-by":"crossref","unstructured":"Drisya, P. S., & Rajan, R. (2017). Significance of teo slope feature in speech emotion recognition. In 2017 international conference on networks & advances in computational technologies (NetACT) (pp. 438\u2013441). Thiruvananthapuram, India.","DOI":"10.1109\/NETACT.2017.8076811"},{"key":"10071_CR7","doi-asserted-by":"crossref","unstructured":"Gatys, L., Ecker, A., & Bethge, M. (2016). Image style transfer using convolutional neural networks. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 2414\u20132423).","DOI":"10.1109\/CVPR.2016.265"},{"key":"10071_CR8","doi-asserted-by":"publisher","unstructured":"Ghosal, D., & Kolekar, M. (2018). Music genre recognition using deep neural networks and transfer learning. 2087\u20132091. 
https:\/\/doi.org\/10.21437\/Interspeech.2018-2045.","DOI":"10.21437\/Interspeech.2018-2045"},{"key":"10071_CR9","unstructured":"Humphrey, E. J., Bello, J. P., & LeCun, Y. (2012). Moving beyond feature design: Deep architectures and automatic feature learning in music informatics. In Proceedings of international society for music information retrieval conference (ISMIR) (pp. 403\u2013408)."},{"key":"10071_CR10","doi-asserted-by":"publisher","first-page":"268","DOI":"10.1016\/j.specom.2008.09.006","volume":"51","author":"Z Inanoglu","year":"2009","unstructured":"Inanoglu, Z., & Young, S. (2009). Data-driven emotion conversion in spoken English. Speech Communication, 51, 268\u2013283.","journal-title":"Speech Communication"},{"key":"10071_CR11","unstructured":"Jaitley, N., & Hinton, G. E. (2013). Vocal tract length perturbation (VTLP) improves speech recognition. In Proceedings of ICML workshop on deep learning for audio, speech, and language (pp. 278\u2013324)."},{"key":"10071_CR12","unstructured":"Jia, Y., Zhang, Y., Weiss, R. J., Wang, Q., Shen, J., Ren, F., Chen, Z., Nguyen, P., Pang, R., Lopez-Moreno, I., & Wu, Y. (2018). Transfer learning from speaker verification to multispeaker text-to-speech synthesis. In Proceedings of neural information processing systems (pp. 1\u201315)."},{"key":"10071_CR13","doi-asserted-by":"crossref","unstructured":"Ko, T., et al. (2015). Audio augmentation for speech recognition. In Sixteenth annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2015-711"},{"key":"10071_CR14","doi-asserted-by":"crossref","unstructured":"Ko, T., Peddinti, V., Povey, D., Seltzer, M. L., & Khudanpur, S. (2017). A study on data augmentation of reverberant speech for robust speech recognition. In IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 
5220\u20135224).","DOI":"10.1109\/ICASSP.2017.7953152"},{"issue":"11","key":"10071_CR15","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y Lecun","year":"1998","unstructured":"Lecun, Y., Bottou, L., Bengio, Y., & Haffner, P. (1998). Gradient-based learning applied to document recognition. Proceedings of the IEEE, 86(11), 2278\u20132324.","journal-title":"Proceedings of the IEEE"},{"key":"10071_CR16","doi-asserted-by":"crossref","unstructured":"Li, T. L. H., & Chan, A. B. (2011). Genre classification and the invariance of MFCC features to key and tempo. In Lecture notes in computer science (Vol. 6523 LNCS, pp. 317\u2013327).","DOI":"10.1007\/978-3-642-17832-0_30"},{"key":"10071_CR17","doi-asserted-by":"crossref","unstructured":"Liao, Z., & Shen, S. (2023). Speech emotion recognition based on swin-transformer. Journal of Physics: Conference Series 2508(1), 012056.","DOI":"10.1088\/1742-6596\/2508\/1\/012056"},{"key":"10071_CR18","doi-asserted-by":"publisher","first-page":"1010","DOI":"10.1016\/j.csl.2019.101027","volume":"60","author":"A Nagrani","year":"2020","unstructured":"Nagrani, A., Chung, J. S., Xie, W., & Zisserman, A. (2020). Voxceleb: Large-scale speaker verification in the wild. Computer Speech Language, 60, 1010\u201327.","journal-title":"Computer Speech Language"},{"key":"10071_CR19","doi-asserted-by":"publisher","first-page":"101084","DOI":"10.1016\/j.ecoinf.2020.101084","volume":"57","author":"L Nanni","year":"2020","unstructured":"Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for improving animal audio classification. Ecological Informatics, 57, 101084.","journal-title":"Ecological Informatics"},{"key":"10071_CR20","doi-asserted-by":"publisher","first-page":"654","DOI":"10.1121\/1.5087827","volume":"145","author":"T Oikarinen","year":"2019","unstructured":"Oikarinen, T., Srinivasan, K., Meisner, O., Hyman, J. 
B., Parmar, S., Fanucci-Kiss, A., Desimone, R., Landman, R., & Feng, G. (2019). Deep convolutional network for animal sound classification and source attribution using dual audio recordings. Journal of the Acoustical Society of America, 145, 654\u2013662.","journal-title":"Journal of the Acoustical Society of America"},{"key":"10071_CR21","doi-asserted-by":"crossref","unstructured":"Padi, S., Sadjadi, S. O., & Manocha, D. (2021). Improved speech emotion recognition using transfer learning and spectrogram augmentation. In Proceedings of the 2021 international conference on multimodal interaction (ICMI).","DOI":"10.1145\/3462244.3481003"},{"key":"10071_CR22","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., & Khudanpur, S. (2015). Librispeech: An ASR corpus based on public domain audio books. In Proceedings of IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 5206\u20135210).","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"10071_CR23","doi-asserted-by":"publisher","unstructured":"Paraskevopoulou, G., Spyrou, E., & Perantonis, S. A. (2022). Data augmentation approach for improving the performance of speech emotion recognition. In Proceedings of the 19th international conference on signal processing and multimedia applications (ICSPMA) (pp. 61\u201369). https:\/\/doi.org\/10.5220\/0011148000003289","DOI":"10.5220\/0011148000003289"},{"key":"10071_CR24","doi-asserted-by":"crossref","unstructured":"Park, D. S., et al. (2019). Specaugment: A simple data augmentation method for automatic speech recognition, arXiv preprint arXiv:1904.08779.","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"10071_CR25","doi-asserted-by":"crossref","unstructured":"Peng, Z., Lu, Y., Pan, S., Liu, Y. (2021). Efficient speech emotion recognition using multi-scale CNN and attention. In ICASSP 2021-2021 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 
3020\u20133024).","DOI":"10.1109\/ICASSP39728.2021.9414286"},{"key":"10071_CR26","doi-asserted-by":"publisher","first-page":"307","DOI":"10.1007\/s00034-022-02122-3","volume":"42","author":"S Resna","year":"2023","unstructured":"Resna, S., & Rajan, R. (2023a). Multi-voice singing synthesis from lyrics. Circuits System Signal Processing, 42, 307\u2013321. https:\/\/doi.org\/10.1007\/s00034-022-02122-3","journal-title":"Circuits System Signal Processing"},{"issue":"1","key":"10071_CR27","doi-asserted-by":"publisher","first-page":"2417","DOI":"10.5875\/ausmt.v13i1.2417","volume":"13","author":"S Resna","year":"2023","unstructured":"Resna, S., & Rajan, R. (2023b). Comparative study on multi-voice singing synthesize systems. International Journal of Automation and Smart Technology, 13(1), 2417.","journal-title":"International Journal of Automation and Smart Technology"},{"key":"10071_CR28","unstructured":"Schluter, J., & Grill, T. (2015). Exploring data augmentation for improved singing voice detection with neural networks. In Proceedings of the 16th international society for music information retrieval conference (ISMIR) (pp. 121\u2013126)."},{"key":"10071_CR29","doi-asserted-by":"crossref","unstructured":"Su, B.-H., & Lee, C. C. (2022). Unsupervised cross-corpus speech emotion recognition using a multi-source cycle-GAN. IEEE Transactions on Affective Computing.","DOI":"10.1109\/TAFFC.2022.3146325"},{"key":"10071_CR30","unstructured":"Su, B., & Lee, C. (2018). Unsupervised cross-corpus speech emotion recognition using a multi-source cycle-GAN. IEEE Transactions on Affective Computing, no. 01, pp. 1\u20131, 5555."},{"key":"10071_CR31","doi-asserted-by":"publisher","unstructured":"Subbarao, M. V., Terlapu, S. K., & Chowdary, P. S. R. (2022). Emotion recognition using BiLSTM classifier. In 2022 international conference on computing, communication and power technology (IC3P) (pp. 195\u2013198). Visakhapatnam, India. 
https:\/\/doi.org\/10.1109\/IC3P52835.2022.00048","DOI":"10.1109\/IC3P52835.2022.00048"},{"key":"10071_CR32","unstructured":"Sukhavasi, M., & Sainath, A. (2019). Music theme recognition using CNN and self-attention. ArXiv: abs\/1911.07041"},{"issue":"4","key":"10071_CR33","doi-asserted-by":"publisher","first-page":"1145","DOI":"10.1109\/TASL.2006.876113","volume":"14","author":"J Tao","year":"2006","unstructured":"Tao, J., Kang, Y., & Li, A. (2006). Prosody conversion from neutral speech to emotional speech. IEEE Transactions on Audio, Speech, and Language Processing, 14(4), 1145\u20131154.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"issue":"5","key":"10071_CR34","doi-asserted-by":"publisher","first-page":"699","DOI":"10.1109\/TAI.2022.3149234","volume":"3","author":"G Tu","year":"2022","unstructured":"Tu, G., Wen, J., Liu, C., Jiang, D., & Cambria, E. (2022). Context- and sentiment-aware networks for emotion recognition in conversation. IEEE Transactions on Artificial Intelligence, 3(5), 699\u2013708.","journal-title":"IEEE Transactions on Artificial Intelligence"},{"key":"10071_CR35","doi-asserted-by":"publisher","first-page":"6212","DOI":"10.3390\/s23136212","volume":"23","author":"R Ullah","year":"2023","unstructured":"Ullah, R., Asif, M., Shah, W. A., Anjam, F., Ullah, I., Khurshaid, T., Wuttisittikulkij, L., Shah, S., Ali, S. M., & Alibakhshikenari, M. (2023). Speech emotion recognition using convolution neural networks and multi-head convolutional transformer. Sensors, 23, 6212. https:\/\/doi.org\/10.3390\/s23136212","journal-title":"Sensors"},{"key":"10071_CR36","unstructured":"Ulyanov, D., & Lebedev, V. (2016). Audio texture synthesis and style transfer. http:\/\/tinyurl.com\/y844x8qt"},{"key":"10071_CR37","doi-asserted-by":"crossref","unstructured":"Wagner, J., Triantafyllopoulos, A., Wierstorf, H., Schmitt, M., Eyben, F., & Schuller, B. (2022). 
Dawn of the transformer era in speech emotion recognition: Closing the valence gap. arXiv preprint arXiv:2203.07378.","DOI":"10.1109\/TPAMI.2023.3263585"},{"key":"10071_CR38","doi-asserted-by":"crossref","unstructured":"Wagner, J., Triantafyllopoulos, A., Wierstorf, H., Schmitt, M., Burkhardt, F., Eyben, F., & Schuller, B. W. (2022). Dawn of the transformer era in speech emotion recognition, closing the valence gap. In arXiv preprint arXiv:2203.07378.","DOI":"10.1109\/TPAMI.2023.3263585"},{"key":"10071_CR39","doi-asserted-by":"crossref","unstructured":"Wong, S. C., Gatt, V. S. A., & McDonnell, M. D. (2016). Understanding data augmentation for classification: When to warp? In 2016 international conference on digital image computing: Techniques and applications (DICTA) (pp. 3586\u20133589).","DOI":"10.1109\/DICTA.2016.7797091"},{"key":"10071_CR40","doi-asserted-by":"publisher","first-page":"358","DOI":"10.1016\/j.patrec.2020.11.009","volume":"140","author":"Z Zhu","year":"2020","unstructured":"Zhu, Z., Dai, W., Hu, Y., & Li, J. (2020). Speech emotion recognition model based on Bi-GRU and focal loss. Pattern Recognition Letters, 140, 358\u2013365. https:\/\/doi.org\/10.1016\/j.patrec.2020.11.009","journal-title":"Pattern Recognition Letters"},{"key":"10071_CR41","doi-asserted-by":"crossref","unstructured":"Zhu, J.-Y., Park, T., Isola, P., & Efros, A. (2017). Unpaired image-to-image translation using cycle-consistent adversarial networks. In Proceedings of the IEEE international conference on computer vision (ICCV) (pp. 
2242\u20132251).","DOI":"10.1109\/ICCV.2017.244"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10071-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-023-10071-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10071-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,11]],"date-time":"2024-01-11T10:15:18Z","timestamp":1704968118000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-023-10071-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12]]},"references-count":41,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["10071"],"URL":"https:\/\/doi.org\/10.1007\/s10772-023-10071-8","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"value":"1381-2416","type":"print"},{"value":"1572-8110","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12]]},"assertion":[{"value":"21 August 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 November 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 December 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that there is no competing interest related to this 
manuscript.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}