{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T00:42:49Z","timestamp":1742949769924,"version":"3.40.3"},"publisher-location":"Cham","reference-count":25,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030757649"},{"type":"electronic","value":"9783030757656"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-75765-6_14","type":"book-chapter","created":{"date-parts":[[2021,5,7]],"date-time":"2021-05-07T09:08:54Z","timestamp":1620378534000},"page":"168-180","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Progressive AutoSpeech: An Efficient and\u00a0General Framework for Automatic Speech Classification"],"prefix":"10.1007","author":[{"given":"Guanghui","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Feng","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Mengchuan","family":"Qiu","sequence":"additional","affiliation":[]},{"given":"Zhuoer","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Wenjie","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Chunfeng","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Yihua","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,5,8]]},"reference":[{"key":"14_CR1","doi-asserted-by":"crossref","unstructured":"Adavanne, S., Drossos, K., \u00c7akir, E., Virtanen, T.: Stacked convolutional and recurrent neural networks for bird audio detection. In: Proceedings of the European Signal Processing Conference (EUSIPCO), pp. 1729\u20131733 (2017)","DOI":"10.23919\/EUSIPCO.2017.8081505"},{"key":"14_CR2","unstructured":"Adavanne, S., Virtanen, T.: Sound event detection using weakly labeled dataset with stacked convolutional and recurrent neural network. arXiv preprint arXiv:1710.02998 (2017)"},{"issue":"1","key":"14_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10994-017-5692-y","volume":"107","author":"P Brazdil","year":"2018","unstructured":"Brazdil, P., Giraud-Carrier, C.: Metalearning and algorithm selection: progress, state of the art and introduction to the 2018 special issue. Mach. Learn. 107(1), 1\u201314 (2018)","journal-title":"Mach. Learn."},{"key":"14_CR4","doi-asserted-by":"crossref","unstructured":"Carmi, N., Cohen, A., Avigal, M., Lerner, A.: A storyteller\u2019s tale: literature audiobooks genre classification using CNN and RNN architectures. In: Proceedings of Interspeech 2019, pp. 3387\u20133390 (2019)","DOI":"10.21437\/Interspeech.2019-1154"},{"key":"14_CR5","doi-asserted-by":"crossref","unstructured":"Dai, W., Dai, C., Qu, S., Li, J., Das, S.: Very deep convolutional neural networks for raw waveforms. In: Proceedings of the International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 421\u2013425 (2017)","DOI":"10.1109\/ICASSP.2017.7952190"},{"key":"14_CR6","unstructured":"Ellis, D.P.W.: Classifying music audio with timbral and chroma features. In: Proceedings of the International Conference on Music Information Retrieval, pp. 339\u2013340 (2007)"},{"key":"14_CR7","unstructured":"Ganchev, T., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of various mfcc implementations on the speaker verification task. In: Proceedings of the International Conference on Speech and Computer, pp. 191\u2013194 (2005)"},{"key":"14_CR8","series-title":"The Springer Series on Challenges in Machine Learning","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-05318-5","volume-title":"Automated Machine Learning","year":"2019","unstructured":"Hutter, F., Kotthoff, L., Vanschoren, J. (eds.): Automated Machine Learning. TSSCML. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-05318-5"},{"key":"14_CR9","unstructured":"Irvin, J., Chartock, E., Hollander, N.: Recurrent neural networks with attention for genre classification (2016)"},{"issue":"2","key":"14_CR10","doi-asserted-by":"publisher","first-page":"285","DOI":"10.1109\/JSTSP.2019.2909479","volume":"13","author":"T Kim","year":"2019","unstructured":"Kim, T., Lee, J., Nam, J.: Comparison and analysis of sample cnn architectures for audio classification. IEEE J. Sel. Topics Signal Process. 13(2), 285\u2013297 (2019)","journal-title":"IEEE J. Sel. Topics Signal Process."},{"key":"14_CR11","unstructured":"Lin, Y.L., Wei, G.: Speech emotion recognition based on HMM and SVM. In: Proceedings of the International Conference on Machine Learning and Cybernetics, pp. 4898\u20134901 (2005)"},{"key":"14_CR12","doi-asserted-by":"crossref","unstructured":"Liu, C., Wang, Y., Kumar, K., Gong, Y.: Investigations on speaker adaptation of LSTM RNN models for speech recognition. In: Proceedings of the International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5020\u20135024 (2016)","DOI":"10.1109\/ICASSP.2016.7472633"},{"key":"14_CR13","unstructured":"Liu, Z., et al.: Autocv challenge design and baseline results. In: CAp 2019 - Conf\u00e9rence sur l\u2019Apprentissage Automatique. Toulouse, France (2019)"},{"key":"14_CR14","doi-asserted-by":"crossref","unstructured":"Majumder, N., Poria, S., Hazarika, D., Mihalcea, R., Gelbukh, A., Cambria, E.: Dialoguernn: an attentive RNN for emotion detection in conversations. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 33, pp. 6818\u20136825 (2019)","DOI":"10.1609\/aaai.v33i01.33016818"},{"key":"14_CR15","doi-asserted-by":"crossref","unstructured":"Malik, M., Adavanne, S., Drossos, K., Virtanen, T., Ticha, D., Jarina, R.: Stacked convolutional and recurrent neural networks for music emotion recognition. arXiv preprint arXiv:1706.02292 (2017)","DOI":"10.23919\/EUSIPCO.2017.8081505"},{"key":"14_CR16","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Chung, J.S., Zisserman, A.: Voxceleb: a large-scale speaker identification dataset. arXiv preprint arXiv:1706.08612 (2017)","DOI":"10.21437\/Interspeech.2017-950"},{"issue":"4","key":"14_CR17","doi-asserted-by":"publisher","first-page":"1085","DOI":"10.1109\/TASL.2011.2172422","volume":"20","author":"S Nakagawa","year":"2011","unstructured":"Nakagawa, S., Wang, L., Ohtsuka, S.: Speaker identification and verification by combining MFCC and phase information. IEEE Trans. Audio Speech Lang. Process. 20(4), 1085\u20131095 (2011)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"14_CR18","doi-asserted-by":"crossref","unstructured":"Padi, B., Mohan, A., Ganapathy, S.: Attention based hybrid i-vector BLSTM model for language recognition. In: Proceedings of Interspeech 2019, pp. 1263\u20131267 (2019)","DOI":"10.21437\/Interspeech.2019-2371"},{"key":"14_CR19","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: an ASR corpus based on public domain audio books. In: Proceedings of the International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5206\u20135210 (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"issue":"3","key":"14_CR20","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1109\/MCAS.2016.2583681","volume":"16","author":"M Parchami","year":"2016","unstructured":"Parchami, M., Zhu, W.P., Champagne, B., Plourde, E.: Recent developments in speech enhancement in the short-time fourier transform domain. IEEE Circ. Syst. Mag. 16(3), 45\u201377 (2016)","journal-title":"IEEE Circ. Syst. Mag."},{"key":"14_CR21","doi-asserted-by":"crossref","unstructured":"Park, K., Mulc, T.: Css10: a collection of single speaker speech datasets for 10 languages. arXiv preprint arXiv:1903.11269 (2019)","DOI":"10.21437\/Interspeech.2019-1500"},{"issue":"1\u20133","key":"14_CR22","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1006\/dspr.1999.0361","volume":"10","author":"DA Reynolds","year":"2000","unstructured":"Reynolds, D.A., Quatieri, T.F., Dunn, R.B.: Speaker verification using adapted gaussian mixture models. Dig. Signal Process. 10(1\u20133), 19\u201341 (2000)","journal-title":"Dig. Signal Process."},{"key":"14_CR23","doi-asserted-by":"crossref","unstructured":"Shen, J., et al.: Natural TTS synthesis by conditioning wavenet on mel spectrogram predictions. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4779\u20134783. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"14_CR24","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Autospeech 2020: the second automated machine learning challenge for speech classification. In: Interspeech 2020, pp. 1967\u20131971 (2020)","DOI":"10.21437\/Interspeech.2020-1986"},{"key":"14_CR25","doi-asserted-by":"crossref","unstructured":"Xie, W., Nagrani, A., Chung, J.S., Zisserman, A.: Utterance-level aggregation for speaker recognition in the wild. In: Proceedings of the International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5791\u20135795 (2019)","DOI":"10.1109\/ICASSP.2019.8683120"}],"container-title":["Lecture Notes in Computer Science","Advances in Knowledge Discovery and Data Mining"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-75765-6_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,5,10]],"date-time":"2021-05-10T23:57:31Z","timestamp":1620691051000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-75765-6_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030757649","9783030757656"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-75765-6_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"8 May 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PAKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pacific-Asia Conference on Knowledge Discovery and Data Mining","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 May 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 May 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"pakdd2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/pakdd2021.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"673","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"157","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"23% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}