{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,23]],"date-time":"2025-06-23T14:44:43Z","timestamp":1750689883091,"version":"3.40.3"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030921842"},{"type":"electronic","value":"9783030921859"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-92185-9_3","type":"book-chapter","created":{"date-parts":[[2021,12,5]],"date-time":"2021-12-05T17:02:46Z","timestamp":1638723766000},"page":"27-39","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Speaker Verification with\u00a0Disentangled Self-attention"],"prefix":"10.1007","author":[{"given":"Junjie","family":"Guo","sequence":"first","affiliation":[]},{"given":"Zhiyuan","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Haodong","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Gongshen","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Xiaoyong","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,12,6]]},"reference":[{"key":"3_CR1","doi-asserted-by":"crossref","unstructured":"Bhattacharya, G., Alam, M.J., Kenny, P.: Deep speaker embeddings for short-duration speaker verification. In: Interspeech, pp. 1517\u20131521 (2017)","DOI":"10.21437\/Interspeech.2017-1575"},{"key":"3_CR2","doi-asserted-by":"crossref","unstructured":"Bu, H., Du, J., Na, X., Wu, B., Zheng, H.: AISHELL-1: an open-source mandarin speech corpus and a speech recognition baseline. In: 2017 20th Conference of the Oriental Chapter of the International Coordinating Committee on Speech Databases and Speech I\/O Systems and Assessment (O-COCOSDA), pp. 1\u20135. IEEE (2017)","DOI":"10.1109\/ICSDA.2017.8384449"},{"issue":"5","key":"3_CR3","doi-asserted-by":"publisher","first-page":"308","DOI":"10.1109\/LSP.2006.870086","volume":"13","author":"WM Campbell","year":"2006","unstructured":"Campbell, W.M., Sturim, D.E., Reynolds, D.A.: Support vector machines using GMM supervectors for speaker verification. IEEE Signal Process. Lett. 13(5), 308\u2013311 (2006)","journal-title":"IEEE Signal Process. Lett."},{"key":"3_CR4","doi-asserted-by":"crossref","unstructured":"Chan, W., Jaitly, N., Le, Q., Vinyals, O.: Listen, attend and spell: a neural network for large vocabulary conversational speech recognition. In: 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4960\u20134964. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"3_CR5","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A.: VoxCeleb2: deep speaker recognition. arXiv preprint arXiv:1806.05622 (2018)","DOI":"10.21437\/Interspeech.2018-1929"},{"issue":"4","key":"3_CR6","doi-asserted-by":"publisher","first-page":"788","DOI":"10.1109\/TASL.2010.2064307","volume":"19","author":"N Dehak","year":"2010","unstructured":"Dehak, N., Kenny, P.J., Dehak, R., Dumouchel, P., Ouellet, P.: Front-end factor analysis for speaker verification. IEEE Trans. Audio Speech Lang. Process. 19(4), 788\u2013798 (2010)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"3_CR7","doi-asserted-by":"crossref","unstructured":"Garofolo, J.S., Lamel, L.F., Fisher, W.M., Fiscus, J.G., Pallett, D.S.: DARPA TIMIT acoustic-phonetic continuous speech corpus CD-ROM. NIST speech disc 1\u20131.1. STIN 93, 27403 (1993)","DOI":"10.6028\/NIST.IR.4930"},{"key":"3_CR8","doi-asserted-by":"crossref","unstructured":"Grawunder, S., Bose, I.: Average speaking pitch vs. average speaker fundamental frequency-reliability, homogeneity, and self report of listener groups. In: Proceedings of the International Conference Speech Prosody, pp. 763\u2013766 (2008)","DOI":"10.21437\/SpeechProsody.2008-170"},{"key":"3_CR9","doi-asserted-by":"crossref","unstructured":"Heigold, G., Moreno, I., Bengio, S., Shazeer, N.: End-to-end text-dependent speaker verification. In: 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5115\u20135119. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7472652"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"India, M., Safari, P., Hernando, J.: Double multi-head attention for speaker verification. In: ICASSP 2021\u20132021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6144\u20136148. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9414877"},{"key":"3_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"531","DOI":"10.1007\/11744085_41","volume-title":"Computer Vision \u2013 ECCV 2006","author":"S Ioffe","year":"2006","unstructured":"Ioffe, S.: Probabilistic linear discriminant analysis. In: Leonardis, A., Bischof, H., Pinz, A. (eds.) ECCV 2006. LNCS, vol. 3954, pp. 531\u2013542. Springer, Heidelberg (2006). https:\/\/doi.org\/10.1007\/11744085_41"},{"key":"3_CR12","doi-asserted-by":"crossref","unstructured":"Li, J., Lee, T.: Text-independent speaker verification with dual attention network. In: Proceedings of the Interspeech 2020, pp. 956\u2013960 (2020)","DOI":"10.21437\/Interspeech.2020-2031"},{"key":"3_CR13","doi-asserted-by":"crossref","unstructured":"Li, Z., Zhao, M., Li, J., Li, L., Hong, Q.: On the usage of multi-feature integration for speaker verification and language identification. In: Proceedings of the Interspeech 2020, pp. 457\u2013461 (2020)","DOI":"10.21437\/Interspeech.2020-1960"},{"key":"3_CR14","unstructured":"Maaten, L.v.d., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9, 2579\u20132605 (2008)"},{"key":"3_CR15","doi-asserted-by":"publisher","unstructured":"McFee, B., et al.: librosa\/librosa: 0.6. 3 (2019). https:\/\/doi.org\/10.5281\/zenodo.2564164","DOI":"10.5281\/zenodo.2564164"},{"key":"3_CR16","doi-asserted-by":"crossref","unstructured":"Mo\u0161ner, L., Mat\u011bjka, P., Novotn\u1ef3, O., \u010cernock\u1ef3, J.H.: Dereverberation and beamforming in far-field speaker recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5254\u20135258. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8462365"},{"key":"3_CR17","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Chung, J.S., Zisserman, A.: VoxCeleb: a large-scale speaker identification dataset. arXiv preprint arXiv:1706.08612 (2017)","DOI":"10.21437\/Interspeech.2017-950"},{"key":"3_CR18","doi-asserted-by":"crossref","unstructured":"Okabe, K., Koshinaka, T., Shinoda, K.: Attentive statistics pooling for deep speaker embedding. In: Proceedings of the Interspeech 2018, pp. 2252\u20132256 (2018)","DOI":"10.21437\/Interspeech.2018-993"},{"key":"3_CR19","doi-asserted-by":"crossref","unstructured":"Qi, M., Yu, Y., Tang, Y., Deng, Q., Mai, F., Zhaxi, N.: Deep CNN with se block for speaker recognition. In: 2020 Information Communication Technologies Conference (ICTC), pp. 240\u2013244. IEEE (2020)","DOI":"10.1109\/ICTC49638.2020.9123307"},{"issue":"1\u20133","key":"3_CR20","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1006\/dspr.1999.0361","volume":"10","author":"DA Reynolds","year":"2000","unstructured":"Reynolds, D.A., Quatieri, T.F., Dunn, R.B.: Speaker verification using adapted gaussian mixture models. Digit. Signal Process. 10(1\u20133), 19\u201341 (2000)","journal-title":"Digit. Signal Process."},{"key":"3_CR21","doi-asserted-by":"crossref","unstructured":"Safari, P., Hernando, J.: Self-attention encoding and pooling for speaker recognition. In: Proceedings of the Interspeech 2020, pp. 941\u2013945 (2020)","DOI":"10.21437\/Interspeech.2020-1446"},{"key":"3_CR22","doi-asserted-by":"crossref","unstructured":"Sankala, S., Rafi, B.S.M., Kodukula, S.R.M.: Self attentive context dependent speaker embedding for speaker verification. In: 2020 National Conference on Communications (NCC), pp. 1\u20135. IEEE (2020)","DOI":"10.1109\/NCC48643.2020.9056043"},{"key":"3_CR23","doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., Povey, D., Khudanpur, S.: Deep neural network embeddings for text-independent speaker verification. In: Interspeech, pp. 999\u20131003 (2017)","DOI":"10.21437\/Interspeech.2017-620"},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., Sell, G., McCree, A., Povey, D., Khudanpur, S.: Speaker recognition for multi-speaker conversations using x-vectors. In: ICASSP 2019\u20132019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5796\u20135800. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8683760"},{"key":"3_CR25","doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., Sell, G., Povey, D., Khudanpur, S.: X-vectors: robust DNN embeddings for speaker recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5329\u20135333. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"3_CR26","doi-asserted-by":"crossref","unstructured":"Snyder, D., Ghahremani, P., Povey, D., Garcia-Romero, D., Carmiel, Y., Khudanpur, S.: Deep neural network-based speaker embeddings for end-to-end speaker verification. In: 2016 IEEE Spoken Language Technology Workshop (SLT), pp. 165\u2013170. IEEE (2016)","DOI":"10.1109\/SLT.2016.7846260"},{"key":"3_CR27","doi-asserted-by":"crossref","unstructured":"Variani, E., Lei, X., McDermott, E., Moreno, I.L., Gonzalez-Dominguez, J.: Deep neural networks for small footprint text-dependent speaker verification. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4052\u20134056. IEEE (2014)","DOI":"10.1109\/ICASSP.2014.6854363"},{"key":"3_CR28","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"3_CR29","doi-asserted-by":"crossref","unstructured":"Villalba, J., et al.: State-of-the-art speaker recognition for telephone and video speech: the JHU-MIT submission for NIST SRE18. In: Interspeech, pp. 1488\u20131492 (2019)","DOI":"10.21437\/Interspeech.2019-2713"},{"key":"3_CR30","doi-asserted-by":"crossref","unstructured":"Wan, L., Wang, Q., Papir, A., Moreno, I.L.: Generalized end-to-end loss for speaker verification. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4879\u20134883. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"3_CR31","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K.: Non-local neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7794\u20137803 (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"3_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"191","DOI":"10.1007\/978-3-030-58555-6_12","volume-title":"Computer Vision \u2013 ECCV 2020","author":"M Yin","year":"2020","unstructured":"Yin, M., et al.: Disentangled non-local neural networks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12360, pp. 191\u2013207. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58555-6_12"},{"key":"3_CR33","doi-asserted-by":"crossref","unstructured":"Zhang, C., Koishida, K.: End-to-end text-independent speaker verification with triplet loss on short utterances. In: Interspeech, pp. 1487\u20131491 (2017)","DOI":"10.21437\/Interspeech.2017-1608"},{"key":"3_CR34","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Ko, T., Snyder, D., Mak, B., Povey, D.: Self-attentive speaker embeddings for text-independent speaker verification. In: Proceedings of the Interspeech 2018, pp. 3573\u20133577 (2018)","DOI":"10.21437\/Interspeech.2018-1158"}],"container-title":["Lecture Notes in Computer Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-92185-9_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,13]],"date-time":"2024-03-13T18:40:28Z","timestamp":1710355228000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-92185-9_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030921842","9783030921859"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-92185-9_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"6 December 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Sanur, Bali","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Indonesia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iconip2021.apnns.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1093","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"226","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"177","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"21% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.57","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"6","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Due to the COVID-19 pandemic the conference was held online.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}