{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:13:12Z","timestamp":1750219992932,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":23,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,21]],"date-time":"2022-10-21T00:00:00Z","timestamp":1666310400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,21]]},"DOI":"10.1145\/3573428.3573659","type":"proceedings-article","created":{"date-parts":[[2023,3,15]],"date-time":"2023-03-15T10:43:09Z","timestamp":1678876989000},"page":"1300-1305","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Analyzing speaker information in self-supervised models to improve unsupervised speech recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0407-7844","authenticated-orcid":false,"given":"Sirui","family":"Li","sequence":"first","affiliation":[{"name":"Key Laboratory of China's Ethnic Languages and Information Technology of Ministry of Education, Northwest Minzu University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6420-6062","authenticated-orcid":false,"given":"Qinya","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of China's Ethnic Languages and Information Technology of Ministry of Education, Northwest Minzu University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4217-890X","authenticated-orcid":false,"given":"Yunpeng","family":"Li","sequence":"additional","affiliation":[{"name":"Key Laboratory of China's Ethnic Languages and Information Technology of Ministry of Education, Northwest Minzu University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9639-5825","authenticated-orcid":false,"given":"Guanyu","family":"Li","sequence":"additional","affiliation":[{"name":"Key Laboratory of China's Ethnic Languages and Information Technology of Ministry of Education, Northwest Minzu University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5556-4167","authenticated-orcid":false,"given":"Senyan","family":"Li","sequence":"additional","affiliation":[{"name":"Key Laboratory of China's Ethnic Languages and Information Technology of Ministry of Education, Northwest Minzu University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2565-9971","authenticated-orcid":false,"given":"Shaoxuan","family":"Wang","sequence":"additional","affiliation":[{"name":"Conservatory of Music, Northwest Minzu University, China"}]}],"member":"320","published-online":{"date-parts":[[2023,3,15]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Chiu C C","author":"Gulati A","year":"2005","unstructured":"Gulati A, Qin J, Chiu C C, Conformer: Convolution-augmented transformer for speech recognition [J]. arXiv preprint arXiv: 2005. 08100, 2020."},{"key":"e_1_3_2_1_2_1","volume-title":"Zhang Y","author":"Han W","year":"2005","unstructured":"Han W, Zhang Z, Zhang Y, Contextnet: Improving convolutional neural networks for automatic speech recognition with global context [J]. arXiv preprint arXiv: 2005. 03191, 2020."},{"key":"e_1_3_2_1_3_1","volume-title":"Completely unsupervised phoneme recognition by adversarially learning mapping relationships from audio embeddings [J]. arXiv preprint arXiv:1804.00316","author":"Liu D R","year":"2018","unstructured":"Liu D R, Chen K Y, Lee H, Completely unsupervised phoneme recognition by adversarially learning mapping relationships from audio embeddings [J]. arXiv preprint arXiv:1804.00316, 2018."},{"key":"e_1_3_2_1_4_1","volume-title":"Unsupervised speech recognition via segmental empirical output distribution matching [J]. arXiv preprint arXiv:1812.09323","author":"Yeh C K","year":"2018","unstructured":"Yeh C K, Chen J, Yu C, Unsupervised speech recognition via segmental empirical output distribution matching [J]. arXiv preprint arXiv:1812.09323, 2018."},{"key":"e_1_3_2_1_5_1","first-page":"27826","article-title":"Unsupervised speech recognition [J]","volume":"34","author":"Baevski A","year":"2021","unstructured":"Baevski A, Hsu W N, Conneau A, Unsupervised speech recognition [J]. Advances in Neural Information Processing Systems, 2021, 34: 27826-27839.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_6_1","volume-title":"Self-Supervised Speech Representation Learning: A Review [J]. arXiv preprint arXiv:2205.10643","author":"Mohamed A","year":"2022","unstructured":"Mohamed A, Lee H, Borgholt L, Self-Supervised Speech Representation Learning: A Review [J]. arXiv preprint arXiv:2205.10643, 2022."},{"key":"e_1_3_2_1_7_1","volume-title":"Generative adversarial nets [J]. Advances in neural information processing systems","author":"Goodfellow I","year":"2014","unstructured":"Goodfellow I, Pouget-Abadie J, Mirza M, Generative adversarial nets [J]. Advances in neural information processing systems, 2014, 27."},{"key":"e_1_3_2_1_8_1","first-page":"12449","article-title":"A framework for self-supervised learning of speech representations [J]","volume":"33","author":"Baevski A","year":"2020","unstructured":"Baevski A, Zhou Y, Mohamed A, wav2vec 2.0: A framework for self-supervised learning of speech representations [J]. Advances in Neural Information Processing Systems, 2020, 33: 12449-12460.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","volume-title":"HuBERT: Self-supervised speech representation learning by masked prediction of hidden units [J]","author":"Hsu W N","year":"2021","unstructured":"Hsu W N, Bolte B, Tsai Y H H, HuBERT: Self-supervised speech representation learning by masked prediction of hidden units [J]. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 2021, 29: 3451-3460."},{"key":"e_1_3_2_1_10_1","volume-title":"Analyzing speaker information in self-supervised models to improve zero-resource speech processing [J]. arXiv preprint arXiv: 2108.00917","author":"van Niekerk B","year":"2021","unstructured":"van Niekerk B, Nortje L, Baas M, Analyzing speaker information in self-supervised models to improve zero-resource speech processing [J]. arXiv preprint arXiv: 2108.00917, 2021."},{"key":"e_1_3_2_1_11_1","volume-title":"Representation learning with contrastive predictive coding [J]. arXiv preprint arXiv:1807.03748","author":"Oord A","year":"2018","unstructured":"Oord A, Li Y, Vinyals O. Representation learning with contrastive predictive coding [J]. arXiv preprint arXiv:1807.03748, 2018."},{"key":"e_1_3_2_1_12_1","volume-title":"Attention is all you need [J]. Advances in neural information processing systems","author":"Vaswani A","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, Attention is all you need [J]. Advances in neural information processing systems, 2017, 30."},{"key":"e_1_3_2_1_13_1","volume-title":"Lee H. Tera: Self-supervised learning of transformer encoder representation for speech [J]","author":"Liu A T","year":"2021","unstructured":"Liu A T, Li S W, Lee H. Tera: Self-supervised learning of transformer encoder representation for speech [J]. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 2021, 29: 2351-2366."},{"key":"e_1_3_2_1_14_1","volume-title":"W2v-bert: Combining contrastive learning and masked language modeling for self-supervised speech pre-training [C]\/\/2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","author":"Chung Y A","year":"2021","unstructured":"Chung Y A, Zhang Y, Han W, W2v-bert: Combining contrastive learning and masked language modeling for self-supervised speech pre-training [C]\/\/2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU). IEEE, 2021: 244-250."},{"key":"e_1_3_2_1_15_1","volume-title":"Unsupervised adaptation with interpretable disentangled representations for distant conversational speech recognition [J]. arXiv preprint arXiv:1806.04872","author":"Hsu W N","year":"2018","unstructured":"Hsu W N, Tang H, Glass J. Unsupervised adaptation with interpretable disentangled representations for distant conversational speech recognition [J]. arXiv preprint arXiv:1806.04872, 2018."},{"key":"e_1_3_2_1_16_1","volume-title":"Melville J. Umap: Uniform manifold approximation and projection for dimension reduction [J]. arXiv preprint arXiv:1802.03426","author":"McInnes L","year":"2018","unstructured":"McInnes L, Healy J, Melville J. Umap: Uniform manifold approximation and projection for dimension reduction [J]. arXiv preprint arXiv:1802.03426, 2018."},{"key":"e_1_3_2_1_17_1","first-page":"5206","article-title":"an asr corpus based on public domain audio books [C]\/\/2015 IEEE international conference on acoustics, speech and signal processing (ICASSP)","volume":"2015","author":"Panayotov V","unstructured":"Panayotov V, Chen G, Povey D, Librispeech: an asr corpus based on public domain audio books [C]\/\/2015 IEEE international conference on acoustics, speech and signal processing (ICASSP). IEEE, 2015: 5206-5210.","journal-title":"IEEE"},{"key":"e_1_3_2_1_18_1","first-page":"7669","article-title":"A benchmark for asr with limited or no supervision [C]\/\/ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","volume":"2020","author":"Kahn J","unstructured":"Kahn J, Rivi\u00e8re M, Zheng W, Libri-light: A benchmark for asr with limited or no supervision [C]\/\/ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2020: 7669-7673.","journal-title":"IEEE"},{"key":"e_1_3_2_1_19_1","volume-title":"Unsupervised cross-modal alignment of speech and text embedding spaces [J]. Advances in neural information processing systems","author":"Chung Y A","year":"2018","unstructured":"Chung Y A, Weng W H, Tong S, Unsupervised cross-modal alignment of speech and text embedding spaces [J]. Advances in neural information processing systems, 2018, 31."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2017.04.008"},{"key":"e_1_3_2_1_21_1","volume-title":"Self-supervised contrastive learning for unsupervised phoneme segmentation [J]. arXiv preprint arXiv:2007.13465","author":"Kreuk F","year":"2020","unstructured":"Kreuk F, Keshet J, Adi Y. Self-supervised contrastive learning for unsupervised phoneme segmentation [J]. arXiv preprint arXiv:2007.13465, 2020."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.6028\/NIST.IR.4930"}],"event":{"name":"EITCE 2022: 2022 6th International Conference on Electronic Information Technology and Computer Engineering","acronym":"EITCE 2022","location":"Xiamen China"},"container-title":["Proceedings of the 2022 6th International Conference on Electronic Information Technology and Computer Engineering"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3573428.3573659","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3573428.3573659","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:49:34Z","timestamp":1750182574000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3573428.3573659"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,21]]},"references-count":23,"alternative-id":["10.1145\/3573428.3573659","10.1145\/3573428"],"URL":"https:\/\/doi.org\/10.1145\/3573428.3573659","relation":{},"subject":[],"published":{"date-parts":[[2022,10,21]]},"assertion":[{"value":"2023-03-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}