{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:18:18Z","timestamp":1776885498850,"version":"3.51.2"},"reference-count":79,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Guangdong Provincial Key Laboratory of Big Data Computing"},{"DOI":"10.13039\/501100004853","name":"Chinese University of Hong Kong","doi-asserted-by":"publisher","award":["B10120210117-KP02"],"award-info":[{"award-number":["B10120210117-KP02"]}],"id":[{"id":"10.13039\/501100004853","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Top. Signal Process."],"published-print":{"date-parts":[[2022,10]]},"DOI":"10.1109\/jstsp.2022.3191845","type":"journal-article","created":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T20:38:10Z","timestamp":1658176690000},"page":"1367-1379","source":"Crossref","is-referenced-by-count":9,"title":["Self-Supervised Learning With Segmental Masking for Speech Representation"],"prefix":"10.1109","volume":"16","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3527-6034","authenticated-orcid":false,"given":"Xianghu","family":"Yue","sequence":"first","affiliation":[{"name":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingru","family":"Lin","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fabian Ritter","family":"Gutierrez","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haizhou","family":"Li","sequence":"additional","affiliation":[{"name":"School of Data Science, The Chinese University of Hong Kong, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1126\/science.1113530"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuron.2010.08.038"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/S0010-0277(87)80002-1"},{"key":"ref4","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. 33th Conf. Neural Inf. Process. Syst","author":"Baevski","year":"2020"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3095662"},{"key":"ref6","article-title":"Representation learning with contrastive predictive coding","author":"Oord","year":"2018"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053569"},{"key":"ref8","article-title":"Augmentation adversarial training for self-supervised speaker recognition","volume-title":"Proc. 34th Conf. Neural Inf. Process. Syst","author":"Huh","year":"2020"},{"key":"ref9","first-page":"6829","article-title":"Disentangled speech embeddings using cross-modal self-supervision","volume-title":"Proc. IEEE Int. Conf. Acoust., Speech Signal Process","author":"Arsha","year":"2020"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3094"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1835"},{"key":"ref12","article-title":"Self-supervised learning for speech enhancement","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang","year":"2020"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1868"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2938863"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414321"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-1473"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054438"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054548"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.106"},{"key":"ref20","first-page":"4171","article-title":"BERT:Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. North Amer. Chapter Assoc. Comput. Linguist.","author":"Devlin","year":"2019"},{"key":"ref21","article-title":"Roberta: A robustly optimized bert pretraining approach","author":"Liu","year":"2019"},{"key":"ref22","article-title":"Improving transformer-based speech recognition using unsupervised pre-training","author":"Jiang","year":"2019"},{"key":"ref23","article-title":"Masked pre-trained encoder base on joint ctc-transformer","author":"Liu","year":"2020"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054458"},{"key":"ref25","first-page":"6000","article-title":"Attention is all you need","volume-title":"Proc. 31th Conf. Neural Inf. Process. Syst.","author":"Vaswani","year":"2018"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383575"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/icassp39728.2021.9414539"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00300"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-905"},{"key":"ref30","first-page":"1","article-title":"Very deep convolutional networks for large-scale image recognition","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Simonyan","year":"2015"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298958"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2763455"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2339736"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.neuro.26.041002.131047"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/d14-1179"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/tkde.2021.3090866"},{"key":"ref38","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume-title":"Proc. Int. Conf. Mach. Learn","author":"Chen","year":"2020"},{"key":"ref39","first-page":"9729","article-title":"Momentum contrast for unsupervised visual representation learning","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit","author":"Kaiming","year":"2020"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1202"},{"key":"ref41","first-page":"5753","article-title":"XlNet: Generalized autoregressive pretraining for language understanding","volume-title":"Proc. 33th Conf. Neural Inf. Process. Syst.","author":"Yang","year":"2019"},{"key":"ref42","first-page":"1","article-title":"Albert: A lite BERT for self-supervised learning of language representations","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lan","year":"2019"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref44","first-page":"1","article-title":"vq-wav2vec: Self-supervised learning of discrete speech representations","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Baevski","year":"2020"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.213"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1228"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053176"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/icassp40776.2020.9053176"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1511"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2605"},{"key":"ref52","first-page":"8089","article-title":"Phoneme boundary detection using learnable segmental features","volume-title":"Proc. IEEE Int. Conf. Acoust., Speech Signal Process","author":"Felix","year":"2020"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-2398"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-877"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1308"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1874"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1544"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/icassp43922.2022.9746102"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-236"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"ref61","article-title":"Layer normalization","author":"Ba","year":"2016"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1775"},{"key":"ref63","first-page":"27826","article-title":"Unsupervised speech recognition","volume-title":"Proc. 34th Conf. Neural Inf. Process. Syst","author":"Baevski","year":"2021"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref65","first-page":"4211","article-title":"Common voice: A massively-multilingual speech corpus","volume-title":"Proc. Lang. Resour. Eval. Conf.","author":"Ardila","year":"2020"},{"key":"ref66","first-page":"1","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kingma","year":"2015"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2016-595"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.3115\/1075527.1075614"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683713"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/29.46546"},{"key":"ref71","article-title":"Speech commands: A public dataset for single-word speech recognition","author":"Warden","year":"2017"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2396"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-008-9076-6"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-015-0068-3"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462015"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2017.2762739"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1006\/csla.1998.0043"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"}],"container-title":["IEEE Journal of Selected Topics in Signal Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/4200690\/9923627\/09832492.pdf?arnumber=9832492","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,1]],"date-time":"2024-02-01T06:42:59Z","timestamp":1706769779000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9832492\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10]]},"references-count":79,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/jstsp.2022.3191845","relation":{},"ISSN":["1932-4553","1941-0484"],"issn-type":[{"value":"1932-4553","type":"print"},{"value":"1941-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,10]]}}}