{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:27:07Z","timestamp":1775230027958,"version":"3.50.1"},"reference-count":33,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,11]]},"DOI":"10.1109\/iscslp.2018.8706625","type":"proceedings-article","created":{"date-parts":[[2019,5,20]],"date-time":"2019-05-20T22:58:35Z","timestamp":1558393115000},"page":"210-214","source":"Crossref","is-referenced-by-count":6,"title":["Emphasis Detection for Voice Dialogue Applications Using Multi-channel Convolutional Bidirectional Long Short-Term Memory Network"],"prefix":"10.1109","author":[{"given":"Long","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jia","family":"Jia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fanbo","family":"Meng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Suping","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cunjun","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Runnan","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2016.7552890"},{"key":"ref32","first-page":"210","article-title":"Use of kernel deep convex networks and end-to-end learning for spoken language understanding","author":"deng","year":"2013","journal-title":"Spoken Language Technology Workshop"},{"key":"ref31","first-page":"4869","article-title":"Voice conversion using deep bidirectional long short-term memory based recurrent neural networks","author":"sun","year":"2015","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref30","article-title":"Using tilt for automatic emphasis detection with bayesian networks","author":"ning","year":"2015"},{"key":"ref10","article-title":"Synthesizing expressive speech to convey focus using a perturbation model for computer-aided pronunciation training","author":"meng","year":"2010","journal-title":"Second Language Studies Acquisition Learning Education and Technology"},{"key":"ref11","article-title":"Integrating sequence information in the audio-visual detection of word prominence in a human-machine interaction scenario","author":"schnall","year":"2014","journal-title":"Fifteenth Annual Conference of the International Speech Communication Association"},{"key":"ref12","article-title":"An empirical model of emphatic word detection","author":"cernak","year":"2015","journal-title":"IDIAP tech report"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-875"},{"key":"ref14","article-title":"Preserving word-level emphasis in speech-to-speech translation using linear regression hsmms","author":"do","year":"2015","journal-title":"Sixteenth Annual Conference of the International Speech Communication Association"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953231"},{"key":"ref16","doi-asserted-by":"crossref","first-page":"225","DOI":"10.1007\/978-3-662-44851-9_15","article-title":"Optimal thresholding of classifiers to maximize f1 measure","author":"lipton","year":"2014","journal-title":"Proceedings of the European Conference on Machine Learning and Knowledge Discovery in Databases"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.276"},{"key":"ref18","article-title":"Notes on convolutional neural networks","author":"bouvrie","year":"2006"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-324"},{"key":"ref28","doi-asserted-by":"crossref","first-page":"18","DOI":"10.25080\/Majora-7b98e3ed-003","article-title":"librosa: Audio and music signal analysis in python","author":"mcfee","year":"2015","journal-title":"Proceedings of the 14th Python in Science Conference"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2643280"},{"key":"ref27","article-title":"Named entity recognition with bidirectional lstm-cnns","author":"chiu","year":"2015","journal-title":"arXiv preprint arXiv 1511 08308"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-896"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953210"},{"key":"ref29","article-title":"A library for support vector machines","author":"chang","year":"2007"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639303"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2009.4960646"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.907570"},{"key":"ref2","article-title":"Multi-task deep learning for user intention understanding in speech interaction systems","author":"an","year":"2017"},{"key":"ref9","first-page":"7704","article-title":"Lexical stress classification for language learning using spectral and segmental features","author":"ferrer","year":"2014","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref1","article-title":"Inferring emotion from conversational voice data: A semi-supervised multi-path generative neural network approach","author":"zhou","year":"2018"},{"key":"ref20","article-title":"A c-lstm neural network for text classification","author":"zhou","year":"2015","journal-title":"arXiv preprint arXiv 1511 08630"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/78.650093"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-2037"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2016.7477625"},{"key":"ref23","article-title":"Tts synthesis with bidirectional lstm based recurrent neural networks","author":"fan","year":"2014","journal-title":"Fifteenth Annual Conference of the International Speech Communication Association"},{"key":"ref26","first-page":"115","article-title":"Learning precise timing with lstm recurrent networks","volume":"3","author":"gers","year":"2002","journal-title":"Journal of Machine Learning Research"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2005.1556215"}],"event":{"name":"2018 11th International Symposium on Chinese Spoken Language Processing (ISCSLP)","location":"Taipei City, Taiwan","start":{"date-parts":[[2018,11,26]]},"end":{"date-parts":[[2018,11,29]]}},"container-title":["2018 11th International Symposium on Chinese Spoken Language Processing (ISCSLP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8701133\/8706262\/08706625.pdf?arnumber=8706625","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,8,23]],"date-time":"2020-08-23T23:09:50Z","timestamp":1598224190000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8706625\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,11]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/iscslp.2018.8706625","relation":{},"subject":[],"published":{"date-parts":[[2018,11]]}}}