{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T18:20:35Z","timestamp":1761157235690},"reference-count":20,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"10","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2021,10,1]]},"DOI":"10.1587\/transinf.2021edl8029","type":"journal-article","created":{"date-parts":[[2021,9,30]],"date-time":"2021-09-30T22:41:20Z","timestamp":1633041680000},"page":"1762-1765","source":"Crossref","is-referenced-by-count":4,"title":["Multi-Task Learning for Improved Recognition of Multiple Types of Acoustic Information"],"prefix":"10.1587","volume":"E104.D","author":[{"given":"Jae-Won","family":"KIM","sequence":"first","affiliation":[{"name":"Department of Electronics Engineering, Kwangwoon University"}]},{"given":"Hochong","family":"PARK","sequence":"additional","affiliation":[{"name":"Department of Electronics Engineering, Kwangwoon University"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"publisher","unstructured":"[1] R. Caruana, \u201cMultitask learning,\u201d Machine Learning, vol.28, no.1, pp.41-75, 1997. 10.1023\/a:1007379606734","DOI":"10.1023\/A:1007379606734"},{"key":"2","unstructured":"[2] S. Ruder, \u201cAn overview of multi-task learning in deep neural networks,\u201d arXiv preprint arXiv:1706.05098, 2017."},{"key":"3","unstructured":"[3] Y. Zhang and Q. Yang, \u201cA survey on multi-task learning,\u201d arXiv preprint arXiv:1707.08114, 2018."},{"key":"4","doi-asserted-by":"publisher","unstructured":"[4] E.C. Smith and M.S. Lewicki, \u201cEfficient auditory coding,\u201d Nature, vol.439, pp.978-982, 2006. 10.1038\/nature04485","DOI":"10.1038\/nature04485"},{"key":"5","doi-asserted-by":"crossref","unstructured":"[5] G. Tzanetakis and P. Cook, \u201cMusical genre classification of audio signals,\u201d IEEE Trans. Speech and Audio Processing, vol.10, no.5, pp.293-302, 2002. 10.1109\/tsa.2002.800560","DOI":"10.1109\/TSA.2002.800560"},{"key":"6","doi-asserted-by":"publisher","unstructured":"[6] M.E. Ayadi, M.S. Kamel, and F. Karray, \u201cSurvey on speech emotion recognition: features, classification schemes, and databases,\u201d Pattern Recognition, vol.44, no.3, pp.572-587, 2011. 10.1016\/j.patcog.2010.09.020","DOI":"10.1016\/j.patcog.2010.09.020"},{"key":"7","doi-asserted-by":"publisher","unstructured":"[7] O. Abdel-Hamid, A.-R. Mohamed, H. Jiang, L. Deng, G. Penn, and D. Yu, \u201cConvolutional neural networks for speech recognition,\u201d IEEE\/ACM Trans. Audio, Speech and Language Processing, vol.22, no.10, pp.1533-1545, 2014. 10.1109\/taslp.2014.2339736","DOI":"10.1109\/TASLP.2014.2339736"},{"key":"8","doi-asserted-by":"publisher","unstructured":"[8] S.-H. Shin, H.-W. Yun, W.-J. Jang, and H. Park, \u201cExtraction of acoustic features based on auditory spike code and its application to music genre classification,\u201d IET Signal Processing, vol.13, no.2, pp.230-234, 2019. 10.1049\/iet-spr.2018.5158","DOI":"10.1049\/iet-spr.2018.5158"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] I. Misra, A. Shrivastava, A. Gupta, and M. Herbert, \u201cCross-stitch networks for multi-task learning,\u201d Proc. IEEE Conf. Computer Vision and Pattern Recognition, pp.3994-4003, 2016. 
10.1109\/cvpr.2016.433","DOI":"10.1109\/CVPR.2016.433"},{"key":"10","doi-asserted-by":"publisher","unstructured":"[10] R. Ranjan, V.M. Patel, and R. Chellappa, \u201cHyperface: a deep multi-task learning framework for face detection, landmark localization, pose estimation, and gender recognition,\u201d IEEE Trans. Pattern Analysis and Machine Intelligence, vol.41, no.1, pp.121-135, 2019. 10.1109\/tpami.2017.2781233","DOI":"10.1109\/TPAMI.2017.2781233"},{"key":"11","doi-asserted-by":"crossref","unstructured":"[11] J.-T. Huang, J. Li, D. Yu, L. Deng, and Y. Gong, \u201cCross-language knowledge transfer using multilingual deep neural network with shared hidden layers,\u201d Proc. IEEE Int. Conf. Acoustics, Speech and Signal Proc., pp.7304-7308, 2013. 10.1109\/icassp.2013.6639081","DOI":"10.1109\/ICASSP.2013.6639081"},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] S. Kim, T. Hori, and S. Watanabe, \u201cCTC-attention based end-to-end speech recognition using multi-task learning,\u201d Proc. IEEE Int. Conf. Acoustics, Speech and Signal Process., pp.4835-4839, 2017.","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"13","doi-asserted-by":"crossref","unstructured":"[13] Z. Zhang, B. Wu, and B. Schuller, \u201cAttention-augmented end-to-end multi-task learning for emotion prediction from speech,\u201d Proc. IEEE Int. Conf. Acoustics, Speech and Signal Proc., pp.6705-6709, 2019. 10.1109\/icassp.2019.8682896","DOI":"10.1109\/ICASSP.2019.8682896"},{"key":"14","doi-asserted-by":"crossref","unstructured":"[14] H. Chen, Y. Zhang, and Q. Liu, \u201cNeural network for heterogeneous annotations,\u201d Proc. Conf. on Empirical Methods in Natural Language Processing, pp.731-741, 2016. 10.18653\/v1\/d16-1070","DOI":"10.18653\/v1\/D16-1070"},{"key":"15","doi-asserted-by":"crossref","unstructured":"[15] D.-J. Kim, J. Choi, T.-H. Oh, Y. Yoon, and I.S. Kweon, \u201cDisjoint multi-task learning between heterogeneous human-centric tasks,\u201d Proc. IEEE Winter Conf. on Applications of Computer Vision, pp.1699-1708. 2018. 10.1109\/wacv.2018.00189","DOI":"10.1109\/WACV.2018.00189"},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] Y. Hong, L. Niu, J. Zhang, and L. Zhang, \u201cBeyond without forgetting: multi-task learning for classification with disjoint datasets,\u201d Proc. IEEE Int. Conf. on Multimedia and Expo, 2020. 10.1109\/icme46284.2020.9102897","DOI":"10.1109\/ICME46284.2020.9102897"},{"key":"17","doi-asserted-by":"publisher","unstructured":"[17] J. Chorowski, R.J. Weiss, S. Bengio, and A. van den Oord, \u201cUnsupervised speech representation learning using WaveNet autoencoders,\u201d IEEE\/ACM Trans. Audio, Speech and Language Processing, vol.27, no.12, pp.2041-2053, 2019. 10.1109\/taslp.2019.2938863","DOI":"10.1109\/TASLP.2019.2938863"},{"key":"18","unstructured":"[18] J.-W. Kim and H. Park, \u201cPerformance enhancement of phoneme and emotion recognition by multi-task training of common neural network,\u201d J. Broadcast Eng., vol.25, no.5, pp.742-749, 2020."},{"key":"19","doi-asserted-by":"crossref","unstructured":"[19] V. Zue, S. Seneff, and J. Glass, \u201cSpeech database development at MIT: TIMIT and beyond,\u201d Speech Communication, vol.9, no.4, pp.351-356, 1990. 10.1016\/0167-6393(90)90010-7","DOI":"10.1016\/0167-6393(90)90010-7"},{"key":"20","doi-asserted-by":"publisher","unstructured":"[20] C. Busso, M. Bulut, C.-C. Lee, A. Kazemzadeh, E. Mower, S. Kim, J.N. Chang, S. Lee, and S.S. 
Narayanan, \u201cIEMOCAP: interactive emotional dyadic motion capture database,\u201d Language Resources and Evaluation, vol.42, no.4, pp.335-359, 2008. 10.1007\/s10579-008-9076-6","DOI":"10.1007\/s10579-008-9076-6"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E104.D\/10\/E104.D_2021EDL8029\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,10,2]],"date-time":"2021-10-02T06:48:46Z","timestamp":1633157326000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E104.D\/10\/E104.D_2021EDL8029\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,1]]},"references-count":20,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2021]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2021edl8029","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,10,1]]},"article-number":"2021EDL8029"}}
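
A minimal sketch of how a Crossref work record like the one above can be retrieved and read. The endpoint https://api.crossref.org/works/{DOI} is the public Crossref REST API, and the field names used below ("message", "title", "container-title", "reference", and so on) are taken directly from the record; the script itself is illustrative and not part of the deposited metadata.

# Fetch the Crossref record for the DOI shown in the metadata above
# and print a few of its fields. Uses only the Python standard library.
import json
import urllib.request

DOI = "10.1587/transinf.2021edl8029"  # DOI from the record above

with urllib.request.urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    work = json.load(resp)["message"]  # the payload sits under "message"

print(work["title"][0])                       # article title ("title" is a list)
print(work["container-title"][0])             # journal: IEICE Transactions on Information and Systems
print(work["published-print"]["date-parts"])  # [[2021, 10, 1]]

# The "reference" array holds the 20 cited works, keyed "1".."20";
# entries without a deposited DOI (e.g. arXiv preprints) lack the "DOI" field.
for ref in work.get("reference", []):
    print(ref["key"], ref.get("DOI", "(no DOI deposited)"))

For bulk or scripted use, Crossref asks clients to identify themselves (for example via a mailto parameter or User-Agent header) to be routed to its polite pool; the bare request above works but is best kept to one-off lookups.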