{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T07:01:19Z","timestamp":1761894079297,"version":"build-2065373602"},"publisher-location":"Singapore","reference-count":33,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819527243","type":"print"},{"value":"9789819527250","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-2725-0_1","type":"book-chapter","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T05:18:49Z","timestamp":1761887929000},"page":"3-16","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Self-supervised Contrastive Learning for\u00a0Content-Centric Speech Representation"],"prefix":"10.1007","author":[{"given":"Jinlong","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ling","family":"Dong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenjun","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhengtao","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shengxiang","family":"Gao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,11,1]]},"reference":[{"key":"1_CR1","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"WN Hsu","year":"2021","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: Hubert: self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM trans. audio, speech lang. proc. 29, 3451\u20133460 (2021)","journal-title":"IEEE\/ACM trans. audio, speech lang. proc."},{"issue":"6","key":"1_CR2","doi-asserted-by":"publisher","first-page":"1505","DOI":"10.1109\/JSTSP.2022.3188113","volume":"16","author":"S Chen","year":"2022","unstructured":"Chen, S., et al.: Wavlm: large-scale self-supervised pre-training for full stack speech processing. IEEE J. Sel. Topics Sig. Proc. 16(6), 1505\u20131518 (2022)","journal-title":"IEEE J. Sel. Topics Sig. Proc."},{"key":"1_CR3","doi-asserted-by":"crossref","unstructured":"Evain, S., et\u00a0al.: Lebenchmark: a reproducible framework for assessing self-supervised representation learning from speech. arXiv preprint arXiv:2104.11462 (2021)","DOI":"10.21437\/Interspeech.2021-556"},{"key":"1_CR4","doi-asserted-by":"crossref","unstructured":"Chang, X., et\u00a0al.: An exploration of self-supervised pretrained representations for end-to-end speech recognition. In: 2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 228\u2013235. IEEE (2021)","DOI":"10.1109\/ASRU51503.2021.9688137"},{"key":"1_CR5","doi-asserted-by":"crossref","unstructured":"Chan, D.M., Ghosh, S.: Content-context factorized representations for automated speech recognition. arXiv preprint arXiv:2205.09872 (2022)","DOI":"10.21437\/Interspeech.2022-390"},{"key":"1_CR6","doi-asserted-by":"crossref","unstructured":"Wang, F., Liu, H.: Understanding the behaviour of contrastive loss. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 2495\u20132504 (2021)","DOI":"10.1109\/CVPR46437.2021.00252"},{"key":"1_CR7","unstructured":"Wang, T., Isola, P.: Understanding contrastive representation learning through alignment and uniformity on the hypersphere. In: International Conference on Machine Learning, pp. 9929\u20139939. PMLR (2020)"},{"key":"1_CR8","unstructured":"Qian, K., Zhang, Y., Gao, H., Ni, J., Lai, C.I., Cox, D., Hasegawa-Johnson, M., Chang, S.: Contentvec: an improved self-supervised speech representation by disentangling speakers. In: International Conference on Machine Learning, pp. 18003\u201318017. PMLR (2022)"},{"key":"1_CR9","doi-asserted-by":"crossref","unstructured":"Peyser, C., Sainath, R.H.A.R.T.N., Picheny, M., Cho, K.: Towards disentangled speech representations. arXiv preprint arXiv:2208.13191 (2022)","DOI":"10.21437\/Interspeech.2022-30"},{"key":"1_CR10","doi-asserted-by":"crossref","unstructured":"Chang, H.J., Liu, A.H., Glass, J.: Self-supervised fine-tuning for improved content representations by speaker-invariant clustering. arXiv preprint arXiv:2305.11072 (2023)","DOI":"10.21437\/Interspeech.2023-847"},{"key":"1_CR11","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: an ASR corpus based on public domain audio books. In: 2015 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp. 5206\u20135210. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Meghanani, A., Hain, T.: Score: Self-supervised correspondence fine-tuning for improved content representations. In: ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 12086\u201312090. IEEE (2024)","DOI":"10.1109\/ICASSP48485.2024.10448060"},{"key":"1_CR13","doi-asserted-by":"crossref","unstructured":"Meghanani, A., Hain, T.: Laser: Learning by aligning self-supervised representations of speech for improving content-related tasks. arXiv preprint arXiv:2406.09153 (2024)","DOI":"10.21437\/Interspeech.2024-1824"},{"key":"1_CR14","unstructured":"Yang, S.w., et\u00a0al.: Superb: speech processing universal performance benchmark. arXiv preprint arXiv:2105.01051 (2021)"},{"key":"1_CR15","unstructured":"Hinton, G.: Improving neural networks by preventing co-adaptation of feature detectors. arXiv preprint arXiv:1207.0580 (2012)"},{"key":"1_CR16","doi-asserted-by":"publisher","first-page":"198","DOI":"10.1016\/j.neunet.2022.12.012","volume":"159","author":"H Pan","year":"2023","unstructured":"Pan, H., Guo, Y., Deng, Q., Yang, H., Chen, J., Chen, Y.: Improving fine-tuning of self-supervised models with contrastive initialization. Neural Netw. 159, 198\u2013207 (2023)","journal-title":"Neural Netw."},{"key":"1_CR17","unstructured":"Gunel, B., Du, J., Conneau, A., Stoyanov, V.: Supervised contrastive learning for pre-trained language model fine-tuning. arXiv preprint arXiv:2011.01403 (2020)"},{"key":"1_CR18","first-page":"29848","volume":"34","author":"Y Zhang","year":"2021","unstructured":"Zhang, Y., Hooi, B., Hu, D., Liang, J., Feng, J.: Unleashing the power of contrastive self-supervised visual models via contrast-regularized fine-tuning. Adv. Neural. Inf. Process. Syst. 34, 29848\u201329860 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1_CR19","doi-asserted-by":"crossref","unstructured":"Ko, T., Peddinti, V., Povey, D., Khudanpur, S.: Audio augmentation for speech recognition. In: Interspeech. vol.\u00a02015, p.\u00a03586 (2015)","DOI":"10.21437\/Interspeech.2015-711"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Yang, Y.Y., et\u00a0al.: Torchaudio: building blocks for audio and speech processing. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6982\u20136986. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9747236"},{"key":"1_CR21","first-page":"16251","volume":"34","author":"HS Choi","year":"2021","unstructured":"Choi, H.S., Lee, J., Kim, W., Lee, J., Heo, H., Lee, K.: Neural analysis and synthesis: reconstructing speech from self-supervised representations. Adv. Neural. Inf. Process. Syst. 34, 16251\u201316265 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1_CR22","doi-asserted-by":"crossref","unstructured":"Eide, E., Gish, H.: A parametric approach to vocal tract length normalization. In: 1996 IEEE International Conference on Acoustics, Speech, and Signal Processing Conference Proceedings. vol.\u00a01, pp. 346\u2013348. IEEE (1996)","DOI":"10.1109\/ICASSP.1996.541103"},{"key":"1_CR23","unstructured":"Cuturi, M.: Sinkhorn distances: Lightspeed computation of optimal transport. Adv. Neural Inf. Proc. Syst. 26 (2013)"},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"Anguera, X., Rodriguez-Fuentes, L.J., Buzo, A., Metze, F., Sz\u00f6ke, I., Penagarikano, M.: Quesst2014: evaluating query-by-example speech search in a zero-resource setting with real-life queries. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5833\u20135837. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7179090"},{"key":"1_CR25","unstructured":"Warden, P.: Speech commands: a dataset for limited-vocabulary speech recognition. ArXiv e-prints (Apr 2018), https:\/\/arxiv.org\/abs\/1804.03209"},{"key":"1_CR26","doi-asserted-by":"crossref","unstructured":"Lugosch, L., Ravanelli, M., Ignoto, P., Tomar, V.S., Bengio, Y.: Speech model pre-training for end-to-end spoken language understanding. arXiv preprint arXiv:1904.03670 (2019)","DOI":"10.21437\/Interspeech.2019-2396"},{"key":"1_CR27","unstructured":"Coucke, A., et\u00a0al.: Snips voice platform: an embedded spoken language understanding system for private-by-design voice interfaces. arXiv preprint arXiv:1805.10190 (2018)"},{"key":"1_CR28","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Chung, J.S., Zisserman, A.: Voxceleb: a large-scale speaker identification dataset. arXiv preprint arXiv:1706.08612 (2017)","DOI":"10.21437\/Interspeech.2017-950"},{"key":"1_CR29","doi-asserted-by":"crossref","unstructured":"Imambi, S., Prakash, K.B., Kanagachidambaresan, G.: Pytorch. programming with tensorflow: solution for edge computing applications, pp. 87\u2013104 (2021)","DOI":"10.1007\/978-3-030-57077-4_10"},{"key":"1_CR30","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv. Neural Inf. Proc. Syst. 33, 12449\u201312460 (2020)"},{"key":"1_CR31","unstructured":"Baevski, A., Hsu, W.N., Xu, Q., Babu, A., Gu, J., Auli, M.: Data2vec: a general framework for self-supervised learning in speech, vision and language. In: International Conference on Machine Learning, pp. 1298\u20131312. PMLR (2022)"},{"issue":"6","key":"1_CR32","first-page":"90","volume":"1","author":"TM Kodinariya","year":"2013","unstructured":"Kodinariya, T.M., et al.: Review on determining number of cluster in k-means clustering. Int. J. 1(6), 90\u201395 (2013)","journal-title":"Int. J."},{"issue":"1","key":"1_CR33","doi-asserted-by":"publisher","first-page":"100","DOI":"10.1038\/s43586-022-00184-w","volume":"2","author":"M Greenacre","year":"2022","unstructured":"Greenacre, M., Groenen, P.J., Hastie, T., d\u2019Enza, A.I., Markos, A., Tuzhilina, E.: Principal component analysis. Nature Rev. Meth. Primers 2(1), 100 (2022)","journal-title":"Nature Rev. Meth. Primers"}],"container-title":["Lecture Notes in Computer Science","Chinese Computational Linguistics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-2725-0_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T05:18:58Z","timestamp":1761887938000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-2725-0_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,1]]},"ISBN":["9789819527243","9789819527250"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-2725-0_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,1]]},"assertion":[{"value":"1 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CCL","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China National Conference on Chinese Computational Linguistics","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Jinan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cncl2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/link.springer.com\/conference\/cncl","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}