{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T14:53:39Z","timestamp":1776956019604,"version":"3.51.4"},"reference-count":23,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,5,23]]},"DOI":"10.1109\/icassp43922.2022.9747085","type":"proceedings-article","created":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T19:50:34Z","timestamp":1651089034000},"page":"8007-8011","source":"Crossref","is-referenced-by-count":18,"title":["Neufa: Neural Network Based End-to-End Forced Alignment with Bidirectional Attention Mechanism"],"prefix":"10.1109","author":[{"given":"Jingbei","family":"Li","sequence":"first","affiliation":[{"name":"Tsinghua University,Shenzhen International Graduate School,Shenzhen,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi","family":"Meng","sequence":"additional","affiliation":[{"name":"Tsinghua University,Shenzhen International Graduate School,Shenzhen,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiyong","family":"Wu","sequence":"additional","affiliation":[{"name":"Tsinghua University,Shenzhen International Graduate 
School,Shenzhen,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Helen","family":"Meng","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qiao","family":"Tian","sequence":"additional","affiliation":[{"name":"ByteDance,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuping","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuxuan","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref10","article-title":"Conformer: Convolution-augmented Transformer for Speech Recognition","author":"gulati","year":"2020"},{"key":"ref11","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2017-1452","article-title":"Tacotron: Towards End-to-End Speech Synthesis","author":"wang","year":"2017"},{"key":"ref12","article-title":"Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions","author":"shen","year":"2017"},{"key":"ref13","first-page":"3171","article-title":"FastSpeech: fast, robust and controllable text to speech","author":"ren","year":"2019","journal-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems"},{"key":"ref14","first-page":"5998","article-title":"Attention is All you Need","author":"vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref15","article-title":"Neural Machine Translation by Jointly Learning to Align and Translate","author":"bahdanau","year":"2014"},{"key":"ref16","first-page":"7836","article-title":"Unsupervised Speech Decomposition via Triple Information Bottleneck","author":"qian","year":"2020","journal-title":"Proceedings of the 37th International 
Conference on Machine Learning"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4012"},{"key":"ref18","doi-asserted-by":"crossref","first-page":"18","DOI":"10.25080\/Majora-7b98e3ed-003","article-title":"librosa: Audio and music signal analysis in python","volume":"8","author":"mcfee","year":"2015","journal-title":"Proceedings of the 14th Python in Science Conference"},{"key":"ref19","article-title":"The Kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"2011 IEEE Workshop on Automatic Speech Recognition & Understanding"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.wocn.2010.11.006"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1353\/lan.2013.0015"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/MASSP.1986.1165342"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2006-204"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"ref7","first-page":"192","article-title":"Prosodylab-aligner: A tool for forced alignment of laboratory speech","volume":"39","author":"gorman","year":"2011","journal-title":"Canadian Acoustics"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1121\/1.4816491"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/j.wocn.2010.11.011"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref20","article-title":"Pre-trained Grapheme-to-Phoneme (G2P) models","author":"li","year":"2021"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2004.09.001"},{"key":"ref23","first-page":"8024","article-title":"PyTorch: An Imperative Style, High-Performance Deep Learning Library","author":"paszke","year":"2019","journal-title":"Advances in Neural Information Processing Systems"}],"event":{"name":"ICASSP 2022 - 2022 IEEE International Conference on 
Acoustics, Speech and Signal Processing (ICASSP)","location":"Singapore, Singapore","start":{"date-parts":[[2022,5,23]]},"end":{"date-parts":[[2022,5,27]]}},"container-title":["ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9745891\/9746004\/09747085.pdf?arnumber=9747085","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,15]],"date-time":"2022-08-15T20:06:48Z","timestamp":1660594008000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9747085\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,5,23]]},"references-count":23,"URL":"https:\/\/doi.org\/10.1109\/icassp43922.2022.9747085","relation":{},"subject":[],"published":{"date-parts":[[2022,5,23]]}}}