{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T14:43:50Z","timestamp":1780497830960,"version":"3.54.1"},"reference-count":47,"publisher":"IEEE","license":[{"start":{"date-parts":[[2019,12,1]],"date-time":"2019-12-01T00:00:00Z","timestamp":1575158400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2019,12,1]],"date-time":"2019-12-01T00:00:00Z","timestamp":1575158400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2019,12,1]],"date-time":"2019-12-01T00:00:00Z","timestamp":1575158400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019,12]]},"DOI":"10.1109\/asru46091.2019.9004009","type":"proceedings-article","created":{"date-parts":[[2020,2,21]],"date-time":"2020-02-21T07:01:33Z","timestamp":1582268493000},"page":"31-38","source":"Crossref","is-referenced-by-count":29,"title":["Simultaneous Speech Recognition and Speaker Diarization for Monaural Dialogue Recordings with Target-Speaker Acoustic Models"],"prefix":"10.1109","author":[{"given":"Naoyuki","family":"Kanda","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shota","family":"Horiguchi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yusuke","family":"Fujita","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yawen","family":"Xue","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kenji","family":"Nagamatsu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref39","article-title":"Vocal tract length perturbation (VTLP) improves speech recognition","volume":"117","author":"jaitly","year":"2013","journal-title":"ICML Workshop on Deep Learning for Audio Speech and Language Processing"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003959"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1893"},{"key":"ref32","article-title":"The Kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"Proc ASRU"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1323"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref37","article-title":"End-to-end neural speaker di-arization with permutation-free objectives","author":"fujita","year":"2019","journal-title":"Proc Inter-speech"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"ref34","first-page":"4930","article-title":"Speaker diarization using deep neural network embeddings","author":"garcia-romero","year":"2017","journal-title":"Proc ICASSP"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462628"},{"key":"ref40","first-page":"309","article-title":"Elas-tic spectral distortion for low resource speech recognition with deep neural networks","author":"kanda","year":"2013","journal-title":"Proc ASRU"},{"key":"ref11","first-page":"31","article-title":"Deep clustering: Discriminative embeddings for segmentation and separation","author":"john","year":"2016","journal-title":"Proc ICASSP"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952155"},{"key":"ref13","first-page":"184","article-title":"Progres-sive joint modeling in unsupervised single-channel overlapped speech recognition","volume":"26","author":"chen","year":"2018","journal-title":"IEEE Trans on ASLP"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461893"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1244"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682822"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952154"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-667"},{"key":"ref19","first-page":"788","article-title":"Front-end factor analysis for speaker verification","volume":"19","author":"najim","year":"2011","journal-title":"IEEE Trans on ASLP"},{"key":"ref28","article-title":"Benchmark test for speech recognition using the Corpus of Spontaneous Japanese","author":"kawahara","year":"2003","journal-title":"ISCA & IEEE Workshop on Spontaneous Speech Processing and Recognition"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-305"},{"key":"ref27","article-title":"Analysis of overlaps in meetings by dialog factors, hot spots, speakers, and collection site: insights for automatic speech recognition","author":"cetin","year":"2006","journal-title":"Proc IC-SLP"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2284"},{"key":"ref6","author":"kanda","year":"2019","journal-title":"Auxiliary interference speaker loss for target-speaker speech recognition"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/29.21701"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462661"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2014.7078610"},{"key":"ref7","article-title":"LIUM SpkDiarization: an open source toolkit for diarization","author":"meignier","year":"2010","journal-title":"CMU SPUD Workshop"},{"key":"ref2","first-page":"373","article-title":"The Rich Transcription 2007 meeting recognition evaluation","author":"jonathan","year":"0","journal-title":"Multimodal Technologies for Perception of Humans"},{"key":"ref9","first-page":"217","article-title":"A study of the cosine distance-based mean shift for telephone speech diarization","volume":"22","author":"senoussaoui","year":"2013","journal-title":"IEEE Trans on ASLP"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.878256"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-79"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/CHiME.2018-2"},{"key":"ref45","first-page":"69","article-title":"In-vestigation of lattice-free maximum mutual information-based acoustic models with sequence-level Kullback-Leibler divergence","author":"kanda","year":"2017","journal-title":"Proc ASRU"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268910"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683664"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682556"},{"key":"ref42","first-page":"5220","article-title":"A study on data augmentation of reverberant speech for robust speech recognition","author":"tom","year":"2017","journal-title":"Proc ICASSP"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-595"},{"key":"ref41","first-page":"3586","article-title":"Audio augmentation for speech recognition","author":"ko","year":"2015","journal-title":"Proc INTERSPEECH"},{"key":"ref23","first-page":"55","article-title":"Speaker adaptation of neural network acoustic models using i-vectors","author":"george","year":"2013","journal-title":"Proc ASRU"},{"key":"ref44","first-page":"1915","article-title":"Ensemble deep learning for speech recognition","author":"li","year":"2014","journal-title":"Proc INTERSPEECH"},{"key":"ref26","article-title":"Corpus of spontaneous japanese: Its design and evaluation","author":"maekawa","year":"2003","journal-title":"ISCA & IEEE Workshop on Spontaneous Speech Processing and Recognition"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2013.6707703"},{"key":"ref25","first-page":"3214","article-title":"A time delay neural network architecture for efficient modeling of long temporal contexts","author":"peddinti","year":"2015","journal-title":"Proc INTERSPEECH"}],"event":{"name":"2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"SG, Singapore","start":{"date-parts":[[2019,12,14]]},"end":{"date-parts":[[2019,12,18]]}},"container-title":["2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8985378\/9003727\/09004009.pdf?arnumber=9004009","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T14:51:19Z","timestamp":1658155879000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9004009\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,12]]},"references-count":47,"URL":"https:\/\/doi.org\/10.1109\/asru46091.2019.9004009","relation":{},"subject":[],"published":{"date-parts":[[2019,12]]}}}