{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,27]],"date-time":"2025-07-27T07:45:28Z","timestamp":1753602328934,"version":"3.28.0"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,1,9]]},"DOI":"10.1109\/slt54892.2023.10023257","type":"proceedings-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T18:54:03Z","timestamp":1674845643000},"page":"677-684","source":"Crossref","is-referenced-by-count":4,"title":["Four-in-One: a Joint Approach to Inverse Text Normalization, Punctuation, Capitalization, and Disfluency for Automatic Speech Recognition"],"prefix":"10.1109","author":[{"given":"Sharman","family":"Tan","sequence":"first","affiliation":[{"name":"Microsoft Corporation"}]},{"given":"Piyush","family":"Behre","sequence":"additional","affiliation":[{"name":"Microsoft Corporation"}]},{"given":"Nick","family":"Kibre","sequence":"additional","affiliation":[{"name":"Microsoft Corporation"}]},{"given":"Issac","family":"Alphonso","sequence":"additional","affiliation":[{"name":"Microsoft Corporation"}]},{"given":"Shuangyu","family":"Chang","sequence":"additional","affiliation":[{"name":"Microsoft Corporation"}]}],"member":"263","reference":[{"key":"ref1","first-page":"198","article-title":"Formatting time-aligned asr transcripts for readability","volume-title":"Human Language Technologies: The 2010 Annual Conference of the North American Chapter of the Association for Computational 
Linguistics","author":"Shugrina"},{"key":"ref2","first-page":"152","article-title":"Truecasing","volume-title":"Proceedings of the 41st Annual Meeting of the Association for Computational Linguistics","author":"Lita"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2006.326816"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2008.4518807"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2008.4518784"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2006.1660187"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-2204"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.nlpmc-1.8"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3074"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1017\/S1351324914000175"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n19-2024"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3557894"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9687976"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053159"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Eurospeech.1997-626"},{"key":"ref17","first-page":"654","article-title":"Punctuation prediction for unsegmented transcript based on word vector","volume-title":"Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC16)","author":"Che"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3134"},{"journal-title":"Openwebtext corpus","year":"2019","author":"Gokaslan","key":"ref19"},{"key":"ref20","article-title":"Opensubtitles2016: Extracting large parallel corpora from movie and tv subtitles","author":"Lison","year":"2016","journal-title":"European Language Resources 
Association"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3412879"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1992.225858"},{"key":"ref23","first-page":"69","article-title":"The fisher corpus: A resource for the next generations of speech-to-text","volume":"4","author":"Cieri","year":"2004","journal-title":"LREC"},{"volume-title":"Switchboard SWBD-DAMSL shallow-discourse-function annotation coders manual, draft 13","year":"1997","author":"Jurafsky","key":"ref24"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.346"},{"key":"ref26","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","author":"Devlin"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1177\/002383099804100410"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1162\/089120100561737"},{"key":"ref29","article-title":"Teaching machines to read and comprehend","volume":"28","author":"Hermann","year":"2015","journal-title":"Advances in neural information processing systems"},{"key":"ref30","article-title":"Rnn approaches to text normalization: A challenge","author":"Sproat","year":"2016","journal-title":"arXiv preprint"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2012.02.003"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414912"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2009.4960690"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.878255"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2002-307"},{"key":"ref36","first-page":"177","article-title":"Better punctuation prediction with dynamic conditional random 
fields","volume-title":"Proceedings of the 2010 conference on empirical methods in natural language processing","author":"Lu"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1079"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746492"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1247"},{"key":"ref40","first-page":"278","article-title":"A neural attention model for disfluency detection","volume-title":"Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics","author":"Wang"}],"event":{"name":"2022 IEEE Spoken Language Technology Workshop (SLT)","start":{"date-parts":[[2023,1,9]]},"location":"Doha, Qatar","end":{"date-parts":[[2023,1,12]]}},"container-title":["2022 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10022052\/10022330\/10023257.pdf?arnumber=10023257","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,13]],"date-time":"2024-02-13T08:37:28Z","timestamp":1707813448000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10023257\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,9]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/slt54892.2023.10023257","relation":{},"subject":[],"published":{"date-parts":[[2023,1,9]]}}}