{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:03:47Z","timestamp":1730297027893,"version":"3.28.0"},"reference-count":23,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,12]]},"DOI":"10.1109\/slt.2018.8639528","type":"proceedings-article","created":{"date-parts":[[2019,2,14]],"date-time":"2019-02-14T23:36:34Z","timestamp":1550187394000},"page":"648-655","source":"Crossref","is-referenced-by-count":8,"title":["Multi-Scale Alignment and Contextual History for Attention Mechanism in Sequence-to-Sequence Model"],"prefix":"10.1109","author":[{"given":"Andros","family":"Tjandra","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sakriani","family":"Sakti","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Satoshi","family":"Nakamura","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2017-1452","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang","year":"2017"},{"key":"ref11","doi-asserted-by":"crossref","first-page":"301","DOI":"10.1109\/ASRU.2017.8268950","article-title":"Listening while speaking: Speech chain by deep learning","author":"tjandra","year":"2017","journal-title":"Automatic Speech Recognition and Understanding (ASRU) 2017 IEEE Workshop on"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1558"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.3115\/1075527.1075614"},{"key":"ref14","article-title":"The Kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"2011 IEEE Workshop on Automatic Speech Recognition &amp; Understanding"},{"article-title":"First-pass large vocabulary continuous speech recognition using bi-directional recurrent DNNs","year":"2014","author":"hannun","key":"ref15"},{"article-title":"Empirical evaluation of rectified activations in convolutional network","year":"2015","author":"xu","key":"ref16"},{"key":"ref17","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-642-24797-2","volume":"385","author":"graves","year":"2012","journal-title":"Supervised Sequence Labelling with Recurrent Neural Networks"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472618"},{"article-title":"Adam: A method for stochastic optimization","year":"2014","author":"kingma","key":"ref19"},{"key":"ref4","first-page":"577","article-title":"Attention-based models for speech recognition","author":"chorowski","year":"2015","journal-title":"Advances in neural information processing systems"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref6","first-page":"431","article-title":"Local monotonic attention mechanism for end-to-end speech and language processing","author":"tjandra","year":"2017","journal-title":"Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1 Long Papers)"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1166"},{"key":"ref8","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"International Conference on Machine Learning"},{"key":"ref7","first-page":"2837","article-title":"Online and linear-time attention by enforcing monotonic alignments","author":"raffel","year":"2017","journal-title":"International Conference on Machine Learning"},{"article-title":"Neural machine translation by jointly learning to align and translate","year":"2014","author":"bahdanau","key":"ref2"},{"key":"ref1","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"2014","journal-title":"Advances in neural information processing systems"},{"journal-title":"Toward expressive speech translation a unified sequence-to-sequence LSTMs approach for translating words and emphasis","year":"2017","author":"do","key":"ref9"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953075"},{"year":"2017","key":"ref22","article-title":"The LJ speech dataset"},{"key":"ref21","first-page":"309","article-title":"Attention-based wav2text with feature transfer learning","author":"tjandra","year":"2017","journal-title":"2017 IEEE Automatic Speech Recognition and Understanding Workshop ASRU 2017 Okinawa Japan December 16&#x2013;20 2017"},{"journal-title":"librosa Audio and music signal analysis in python","year":"2015","author":"mcfee","key":"ref23"}],"event":{"name":"2018 IEEE Spoken Language Technology Workshop (SLT)","start":{"date-parts":[[2018,12,18]]},"location":"Athens, Greece","end":{"date-parts":[[2018,12,21]]}},"container-title":["2018 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8632666\/8639030\/08639528.pdf?arnumber=8639528","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,27]],"date-time":"2022-01-27T08:12:49Z","timestamp":1643271169000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8639528\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,12]]},"references-count":23,"URL":"https:\/\/doi.org\/10.1109\/slt.2018.8639528","relation":{},"subject":[],"published":{"date-parts":[[2018,12]]}}}