{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:26:41Z","timestamp":1775230001357,"version":"3.50.1"},"reference-count":58,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"RGC of Hong Kong SAR","award":["PolyU 15210122"],"award-info":[{"award-number":["PolyU 15210122"]}]},{"name":"NSTC of Taiwan","award":["112-2634-F-A49-006"],"award-info":[{"award-number":["112-2634-F-A49-006"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2024.3402077","type":"journal-article","created":{"date-parts":[[2024,5,16]],"date-time":"2024-05-16T17:33:01Z","timestamp":1715880781000},"page":"2704-2715","source":"Crossref","is-referenced-by-count":12,"title":["Contrastive Self-Supervised Speaker Embedding With Sequential Disentanglement"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9580-2414","authenticated-orcid":false,"given":"Youzhi","family":"Tu","sequence":"first","affiliation":[{"name":"Department of Electrical and Electronic Engineering, Hong Kong Polytechnic University, Hong Kong SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8854-3760","authenticated-orcid":false,"given":"Man-Wai","family":"Mak","sequence":"additional","affiliation":[{"name":"Department of Electrical and Electronic Engineering, Hong Kong Polytechnic University, Hong Kong SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3466-8941","authenticated-orcid":false,"given":"Jen-Tzung","family":"Chien","sequence":"additional","affiliation":[{"name":"Institute of Electrical and Computer Engineering, National Yang Ming Chiao Tung University, Hsinchu, Taiwan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683120"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1158"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414094"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3153267"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2018.2822810"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"ref9","article-title":"The IDLAB voxceleb speaker recognition challenge 2020 system description","author":"Thienpondt","year":"2020"},{"key":"ref10","article-title":"Augmentation adversarial training for self-supervised speaker recognition","volume-title":"Proc. Self-Supervised Learn. Speech Audio Process. NeurIPS Workshops","author":"Huh","year":"2020"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414713"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413351"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414973"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747162"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-742"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11141"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096915"},{"key":"ref18","first-page":"5628","article-title":"A theoretical analysis of contrastive unsupervised representation learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Arora","year":"2019"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.50"},{"key":"ref21","article-title":"Towards a definition of disentangled representations","author":"Higgins","year":"2020"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003979"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-28"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747778"},{"key":"ref25","first-page":"4114","article-title":"Challenging common assumptions in the unsupervised learning of disentangled representations","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Locatello","year":"2018"},{"key":"ref26","first-page":"5670","article-title":"Disentangled sequential autoencoder","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li","year":"2018"},{"key":"ref27","first-page":"10105","article-title":"Contrastively disentangled sequential variational autoencoder","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Bai","year":"2021"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP57327.2022.10037896"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2012.2237151"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448284"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.5555\/3524938.3525087"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref34","article-title":"On the duality between contrastive and non-contrastive self-supervised learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Garrido","year":"2023"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054601"},{"key":"ref36","first-page":"5092","article-title":"Unsupervised adversarial invariance","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Jaiswal","year":"2018"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-1837"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10643"},{"key":"ref39","first-page":"21065","article-title":"Self-supervised neural factor analysis for disentangling utterance-level speech representations","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Lin","year":"2023"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref41","first-page":"18003","article-title":"ContentVec: An improved self-supervised speech representation by disentangling speakers","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Qian","year":"2022"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1142\/9789812797926_0003"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"ref44","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018"},{"key":"ref45","article-title":"Auto-encoding variational bayes","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kingma","year":"2014"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref47","article-title":"On mutual information maximization for representation learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Tschannen","year":"2020"},{"key":"ref48","article-title":"Derivations for linear algebra and optimization","author":"Duchi"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2022.01.002"},{"key":"ref50","article-title":"VoxSRC 2021: The third voxceleb speaker recognition challenge","author":"Brown","year":"2022"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383459"},{"key":"ref52","article-title":"MUSAN: A music, speech, and noise corpus","author":"Snyder"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1436"},{"key":"ref54","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kingma","year":"2015"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10278"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1226"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3036"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683760"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/10304349\/10531230.pdf?arnumber=10531230","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,5,24]],"date-time":"2024-05-24T17:26:29Z","timestamp":1716571589000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10531230\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":58,"URL":"https:\/\/doi.org\/10.1109\/taslp.2024.3402077","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}