{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T05:10:10Z","timestamp":1755839410751,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"The National Natural Science Foundation of China","award":["6197071246, 61806130, 62002239"],"award-info":[{"award-number":["6197071246, 61806130, 62002239"]}]},{"name":"The National Key R&D Program of China","award":["2020YFA0908700"],"award-info":[{"award-number":["2020YFA0908700"]}]},{"name":"The Natural Science Foundation of Guangdong Province","award":["2021A1515011153"],"award-info":[{"award-number":["2021A1515011153"]}]},{"name":"Shenzhen Science and Technology Innovation Commission","award":["20200805142159001"],"award-info":[{"award-number":["20200805142159001"]}]},{"name":"The Guangdong Pearl River Talent Recruitment Program","award":["2019ZT08X603"],"award-info":[{"award-number":["2019ZT08X603"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3548240","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:42:35Z","timestamp":1665416555000},"page":"6737-6745","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Phoneme-Aware Adaptation with Discrepancy Minimization and Dynamically-Classified Vector for Text-independent Speaker Verification"],"prefix":"10.1145","author":[{"given":"Jia","family":"Wang","sequence":"first","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}]},{"given":"Tianhao","family":"Lan","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}]},{"given":"Jie","family":"Chen","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}]},{"given":"Chengwen","family":"Luo","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}]},{"given":"Chao","family":"Wu","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}]},{"given":"Jianqiang","family":"Li","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Kernels for vector-valued functions: A review. arXiv preprint arXiv:1106.6251","author":"Alvarez Mauricio A","year":"2011","unstructured":"Mauricio A Alvarez , Lorenzo Rosasco , and Neil D Lawrence . 2011. Kernels for vector-valued functions: A review. arXiv preprint arXiv:1106.6251 ( 2011 ). Mauricio A Alvarez, Lorenzo Rosasco, and Neil D Lawrence. 2011. Kernels for vector-valued functions: A review. arXiv preprint arXiv:1106.6251 (2011)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413907"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Nanxin Chen Yanmin Qian and Kai Yu. 2015. Multi-task learning for text-dependent speaker verification. In Sixteenth annual conference of the international speech communication association.  Nanxin Chen Yanmin Qian and Kai Yu. 2015. 
{"key":"e_1_3_2_2_4_1","volume-title":"Voxceleb2: Deep speaker recognition. arXiv preprint arXiv:1806.05622","author":"Chung Joon Son","year":"2018","unstructured":"Joon Son Chung, Arsha Nagrani, and Andrew Zisserman. 2018. Voxceleb2: Deep speaker recognition. arXiv preprint arXiv:1806.05622 (2018)."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2064307"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"e_1_3_2_2_7_1","volume-title":"Ecapa-tdnn: Emphasized channel attention, propagation and aggregation in tdnn based speaker verification. arXiv preprint arXiv:2005.07143","author":"Desplanques Brecht","year":"2020","unstructured":"Brecht Desplanques, Jenthe Thienpondt, and Kris Demuynck. 2020. Ecapa-tdnn: Emphasized channel attention, propagation and aggregation in tdnn based speaker verification. arXiv preprint arXiv:2005.07143 (2020)."},{"key":"e_1_3_2_2_8_1","first-page":"723","article-title":"A kernel two-sample test","volume":"13","author":"Gretton Arthur","year":"2012","unstructured":"Arthur Gretton, Karsten M Borgwardt, Malte J Rasch, Bernhard Sch\u00f6lkopf, and Alexander Smola. 2012. A kernel two-sample test. The Journal of Machine Learning Research, Vol. 13, 1 (2012), 723--773.","journal-title":"The Journal of Machine Learning Research"},{"key":"e_1_3_2_2_9_1","volume-title":"Unified hypersphere embedding for speaker recognition. arXiv preprint arXiv:1807.08312","author":"Hajibabaei Mahdi","year":"2018","unstructured":"Mahdi Hajibabaei and Dengxin Dai. 2018. Unified hypersphere embedding for speaker recognition. arXiv preprint arXiv:1807.08312 (2018)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472652"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Geoffrey Hinton, Li Deng, Dong Yu, George E Dahl, Abdel-rahman Mohamed, Navdeep Jaitly, Andrew Senior, Vincent Vanhoucke, Patrick Nguyen, Tara N Sainath, et al. 2012. Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups. IEEE Signal processing magazine, Vol. 29, 6 (2012), 82--97.","DOI":"10.1109\/MSP.2012.2205597"},
{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00594"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Zili Huang, Shuai Wang, and Kai Yu. 2018. Angular Softmax for Short-Duration Text-independent Speaker Verification. In Interspeech. 3623--3627.","DOI":"10.21437\/Interspeech.2018-1545"},{"key":"e_1_3_2_2_15_1","volume-title":"Self multi-head attention for speaker recognition. arXiv preprint arXiv:1906.09890","author":"India Miquel","year":"2019","unstructured":"Miquel India, Pooyan Safari, and Javier Hernando. 2019. Self multi-head attention for speaker recognition. arXiv preprint arXiv:1906.09890 (2019)."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/11744085_41"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6853887"},{"key":"e_1_3_2_2_19_1","volume-title":"Deep speaker: an end-to-end neural speaker embedding system. arXiv preprint arXiv:1705.02304","author":"Li Chao","year":"2017","unstructured":"Chao Li, Xiaokong Ma, Bing Jiang, Xiangang Li, Xuewei Zhang, Xiao Liu, Ying Cao, Ajay Kannan, and Zhenyao Zhu. 2017. Deep speaker: an end-to-end neural speaker embedding system. arXiv preprint arXiv:1705.02304 (2017)."},{"key":"e_1_3_2_2_20_1","volume-title":"Real Additive Margin Softmax for Speaker Verification. arXiv preprint arXiv:2110.09116","author":"Li Lantian","year":"2021","unstructured":"Lantian Li, Ruiqian Nai, and Dong Wang. 2021. Real Additive Margin Softmax for Speaker Verification. arXiv preprint arXiv:2110.09116 (2021)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-61"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054134"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.713"},{"key":"e_1_3_2_2_25_1","volume-title":"Large margin softmax loss for speaker verification. arXiv preprint arXiv:1904.03479","author":"Liu Yi","year":"2019","unstructured":"Yi Liu, Liang He, and Jia Liu. 2019. Large margin softmax loss for speaker verification. arXiv preprint arXiv:1904.03479 (2019)."},{"key":"e_1_3_2_2_26_1","volume-title":"Speaker embedding extraction with phonetic information. arXiv preprint arXiv:1804.04862","author":"Liu Yi","year":"2018","unstructured":"Yi Liu, Liang He, Jia Liu, and Michael T Johnson. 2018. Speaker embedding extraction with phonetic information. arXiv preprint arXiv:1804.04862 (2018)."},
{"key":"e_1_3_2_2_27_1","volume-title":"Phoneme-aware and channel-wise attentive learning for text-dependent speaker verification. arXiv preprint arXiv:2106.13514","author":"Liu Yan","year":"2021","unstructured":"Yan Liu, Zheng Li, Lin Li, and Qingyang Hong. 2021. Phoneme-aware and channel-wise attentive learning for text-dependent speaker verification. arXiv preprint arXiv:2106.13514 (2021)."},{"key":"e_1_3_2_2_28_1","volume-title":"Voxceleb: a large-scale speaker identification dataset. arXiv preprint arXiv:1706.08612","author":"Nagrani Arsha","year":"2017","unstructured":"Arsha Nagrani, Joon Son Chung, and Andrew Zisserman. 2017. Voxceleb: a large-scale speaker identification dataset. arXiv preprint arXiv:1706.08612 (2017)."},{"key":"e_1_3_2_2_29_1","volume-title":"Attentive statistics pooling for deep speaker embedding. arXiv preprint arXiv:1803.10963","author":"Okabe Koji","year":"2018","unstructured":"Koji Okabe, Takafumi Koshinaka, and Koichi Shinoda. 2018. Attentive statistics pooling for deep speaker embedding. arXiv preprint arXiv:1803.10963 (2018)."},{"key":"e_1_3_2_2_30_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, Vol. 32 (2019), 8026--8037."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"crossref","unstructured":"Vijayaditya Peddinti, Daniel Povey, and Sanjeev Khudanpur. 2015. A time delay neural network architecture for efficient modeling of long temporal contexts. In Sixteenth annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2015-647"},{"key":"e_1_3_2_2_32_1","volume-title":"IEEE 2011 workshop on automatic speech recognition and understanding. IEEE Signal Processing Society","author":"Povey Daniel","year":"2011","unstructured":"Daniel Povey, Arnab Ghoshal, Gilles Boulianne, Lukas Burget, Ondrej Glembek, Nagendra Goel, Mirko Hannemann, Petr Motlicek, Yanmin Qian, Petr Schwarz, et al. 2011. The Kaldi speech recognition toolkit. In IEEE 2011 workshop on automatic speech recognition and understanding. IEEE Signal Processing Society."},
{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.89"},{"key":"e_1_3_2_2_34_1","volume-title":"Musan: A music, speech, and noise corpus. arXiv preprint arXiv:1510.08484","author":"Snyder David","year":"2015","unstructured":"David Snyder, Guoguo Chen, and Daniel Povey. 2015. Musan: A music, speech, and noise corpus. arXiv preprint arXiv:1510.08484 (2015)."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"crossref","unstructured":"David Snyder, Daniel Garcia-Romero, Daniel Povey, and Sanjeev Khudanpur. 2017. Deep neural network embeddings for text-independent speaker verification. In Interspeech. 999--1003.","DOI":"10.21437\/Interspeech.2017-620"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683760"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2639323"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053871"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414676"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854363"},{"key":"e_1_3_2_2_42_1","volume-title":"State-of-the-Art Speaker Recognition for Telephone and Video Speech: The JHU-MIT Submission for NIST SRE18","author":"Villalba Jes\u00fas","year":"2019","unstructured":"Jes\u00fas Villalba, Nanxin Chen, David Snyder, Daniel Garcia-Romero, Alan McCree, Gregory Sell, Jonas Borgstrom, Fred Richardson, Suwon Shon, Fran\u00e7ois Grondin, et al. 2019. State-of-the-Art Speaker Recognition for Telephone and Video Speech: The JHU-MIT Submission for NIST SRE18. In Interspeech. 1488--1492."},{"key":"e_1_3_2_2_43_1","volume-title":"Thchs-30: A free chinese speech corpus. arXiv preprint arXiv:1512.01882","author":"Wang Dong","year":"2015","unstructured":"Dong Wang and Xuewei Zhang. 2015. Thchs-30: A free chinese speech corpus. arXiv preprint arXiv:1512.01882 (2015)."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2018.2822810"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"crossref","unstructured":"Shuai Wang, Johan Rohdin, Luk\u00e1\u0161 Burget, Old\u0159ich Plchot, Yanmin Qian, Kai Yu, and Jan \u010cernock\u00fd. 2019. On the Usage of Phonetic Information for Text-Independent Speaker Embedding Extraction. In Interspeech. 1148--1152.","DOI":"10.21437\/Interspeech.2019-3036"},
{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6906"},{"key":"e_1_3_2_2_47_1","volume-title":"An experimental study on speech enhancement based on deep neural networks","author":"Xu Yong","year":"2013","unstructured":"Yong Xu, Jun Du, Li-Rong Dai, and Chin-Hui Lee. 2013. An experimental study on speech enhancement based on deep neural networks. IEEE Signal processing letters, Vol. 21, 1 (2013), 65--68."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"crossref","unstructured":"Chunlei Zhang and Kazuhito Koishida. 2017. End-to-end text-independent speaker verification with triplet loss on short utterances. In Interspeech. 1487--1491.","DOI":"10.21437\/Interspeech.2017-1608"},{"key":"e_1_3_2_2_49_1","volume-title":"Dynamic Margin Softmax Loss for Speaker Verification","author":"Zhou Dao","year":"2020","unstructured":"Dao Zhou, Longbiao Wang, Kong Aik Lee, Yibo Wu, Meng Liu, Jianwu Dang, and Jianguo Wei. 2020. Dynamic Margin Softmax Loss for Speaker Verification. In INTERSPEECH. 3800--3804."}],"event":{"name":"MM '22: The 30th ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Lisboa, Portugal","acronym":"MM '22"},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548240","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3548240","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:21Z","timestamp":1750186821000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548240"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":49,"alternative-id":["10.1145\/3503161.3548240","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3548240","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}