{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:25:54Z","timestamp":1750220754835,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,4,23]],"date-time":"2020-04-23T00:00:00Z","timestamp":1587600000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,4,23]]},"DOI":"10.1145\/3404555.3404571","type":"proceedings-article","created":{"date-parts":[[2020,8,20]],"date-time":"2020-08-20T17:01:06Z","timestamp":1597942866000},"page":"317-322","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Learning the Front-End Speech Feature with Raw Waveform for End-to-End Speaker Recognition"],"prefix":"10.1145","author":[{"given":"Ningxin","family":"Liang","sequence":"first","affiliation":[{"name":"School of Automation Science and Engineering, South China University of Technology, Guangzhou, China"}]},{"given":"Wei","family":"Xu","sequence":"additional","affiliation":[{"name":"School of Automation Science and Engineering, South China University of Technology, Guangzhou, China"}]},{"given":"Chengfang","family":"Luo","sequence":"additional","affiliation":[{"name":"School of Automation Science and Engineering, South China University of Technology, Guangzhou, China"}]},{"given":"Wenxiong","family":"Kang","sequence":"additional","affiliation":[{"name":"School of Automation Science and Engineering, South China University of Technology, Guangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2020,8,20]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"D. A. Reynolds T. F. Quatieri R. B. Dunn Speaker verification using adapted gaussian mixture models Digital process-ing 10 (2000) 19--41. 10.1006\/dspr.1999.0361  D. A. Reynolds T. F. Quatieri R. B. Dunn Speaker verification using adapted gaussian mixture models Digital process-ing 10 (2000) 19--41. 10.1006\/dspr.1999.0361","DOI":"10.1006\/dspr.1999.0361"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-620"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639622"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"K. Okabe T. Koshinaka K. Shinoda Attentive statistics pooling for deep speaker embedding arXiv preprint arXiv: 1803.10963 (2018).  K. Okabe T. Koshinaka K. Shinoda Attentive statistics pooling for deep speaker embedding arXiv preprint arXiv: 1803.10963 (2018).","DOI":"10.21437\/Interspeech.2018-993"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462025"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"W. Cai J. Chen M. Li Exploring the encoding layer and loss function in end-to-end speaker and language recognition system arXiv preprint arXiv:1804.05160 (2018).  W. Cai J. Chen M. Li Exploring the encoding layer and loss function in end-to-end speaker and language recognition system arXiv preprint arXiv:1804.05160 (2018).","DOI":"10.21437\/Odyssey.2018-11"},{"key":"e_1_3_2_1_7_1","unstructured":"C. Li X. Ma B. Jiang X. Li X. Zhang X. Liu Y. Cao A. Kannan Z. Zhu Deep speaker: an end-to-end neural speaker embedding system arXiv preprint arXiv:1705.02304 (2017).  C. Li X. Ma B. Jiang X. Li X. Zhang X. Liu Y. Cao A. Kannan Z. Zhu Deep speaker: an end-to-end neural speaker embedding system arXiv preprint arXiv:1705.02304 (2017)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP.2016.7738816"},{"key":"e_1_3_2_1_10_1","unstructured":"D. Palaz R. Collobert M. M. Doss End-to-end phoneme sequence recognition using convolutional neural networks arXiv preprint arXiv:1312.2137 (2013).  D. Palaz R. Collobert M. M. Doss End-to-end phoneme sequence recognition using convolutional neural networks arXiv preprint arXiv:1312.2137 (2013)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178847"},{"volume-title":"Sixteenth Annual Conference of the International Speech Communication Association","year":"2015","author":"Sainath R. J.","key":"e_1_3_2_1_12_1"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"N. Zeghidour N. Usunier G. Synnaeve R. Collobert E. Dupoux End-to-end speech recognition from the raw waveform arXiv preprint arXiv:1806.07098 (2018).  N. Zeghidour N. Usunier G. Synnaeve R. Collobert E. Dupoux End-to-end speech recognition from the raw waveform arXiv preprint arXiv:1806.07098 (2018).","DOI":"10.21437\/Interspeech.2018-2414"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462575"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2064307"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Y. Xu I. McLoughlin Y. Song K. Wu Improved i-vector representation for speaker diarization Circuits Systems and Signal Processing 35 (2016) 3393--3404. 10.1007\/s00034-015-0206-2  Y. Xu I. McLoughlin Y. Song K. Wu Improved i-vector representation for speaker diarization Circuits Systems and Signal Processing 35 (2016) 3393--3404. 10.1007\/s00034-015-0206-2","DOI":"10.1007\/s00034-015-0206-2"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1050"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2017.2687041"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953204"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472652"},{"key":"e_1_3_2_1_22_1","unstructured":"v. d. Oord S. Dieleman H. Zen K. Simonyan O. Vinyals A. Graves N. Kalchbrenner A. Senior K. Kavukcuoglu Wavenet: A generative model for raw audio arXiv preprint arXiv:1609.03499 (2016).  v. d. Oord S. Dieleman H. Zen K. Simonyan O. Vinyals A. Graves N. Kalchbrenner A. Senior K. Kavukcuoglu Wavenet: A generative model for raw audio arXiv preprint arXiv:1609.03499 (2016)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639585"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"J.-w. Jung H.-S. Heo J.-h. Kim H.-j. Shim H.-J. Yu Rawnet: Advanced end-to-end deep neural network using raw waveforms for text-independent speaker verification arXiv preprint arXiv:1904.08104 (2019).  J.-w. Jung H.-S. Heo J.-h. Kim H.-j. Shim H.-J. Yu Rawnet: Advanced end-to-end deep neural network using raw waveforms for text-independent speaker verification arXiv preprint arXiv:1904.08104 (2019).","DOI":"10.21437\/Interspeech.2019-1982"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2016.7846260"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"A.-r. Mohamed G. Hinton G. Penn Understanding how deep belief networks perform acoustic modelling neural networks (2012) 6--9.  A.-r. Mohamed G. Hinton G. Penn Understanding how deep belief networks perform acoustic modelling neural networks (2012) 6--9.","DOI":"10.1109\/ICASSP.2012.6288863"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2007.4409052"},{"key":"e_1_3_2_1_29_1","unstructured":"D. Ulyanov A. Vedaldi V. Lempitsky Instance normalization: The missing ingredient for fast stylization arXiv preprint arXiv:1607.08022 (2016).  D. Ulyanov A. Vedaldi V. Lempitsky Instance normalization: The missing ingredient for fast stylization arXiv preprint arXiv:1607.08022 (2016)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2018.2822810"},{"key":"e_1_3_2_1_33_1","unstructured":"J. Yamagishi C. Veaux K. MacDonald et al. Cstr vctk corpus: English multi-speaker corpus for cstr voice cloning toolkit (version 0.92) (2019).  J. Yamagishi C. Veaux K. MacDonald et al. Cstr vctk corpus: English multi-speaker corpus for cstr voice cloning toolkit (version 0.92) (2019)."}],"event":{"name":"ICCAI '20: 2020 6th International Conference on Computing and Artificial Intelligence","sponsor":["University of Tsukuba University of Tsukuba"],"location":"Tianjin China","acronym":"ICCAI '20"},"container-title":["Proceedings of the 2020 6th International Conference on Computing and Artificial Intelligence"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404555.3404571","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3404555.3404571","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T22:38:59Z","timestamp":1750199939000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404555.3404571"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,4,23]]},"references-count":32,"alternative-id":["10.1145\/3404555.3404571","10.1145\/3404555"],"URL":"https:\/\/doi.org\/10.1145\/3404555.3404571","relation":{},"subject":[],"published":{"date-parts":[[2020,4,23]]},"assertion":[{"value":"2020-08-20","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}