{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T11:38:07Z","timestamp":1757590687955,"version":"3.40.3"},"publisher-location":"Cham","reference-count":20,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030042202"},{"type":"electronic","value":"9783030042219"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-04221-9_19","type":"book-chapter","created":{"date-parts":[[2018,11,16]],"date-time":"2018-11-16T12:32:16Z","timestamp":1542371536000},"page":"210-220","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":32,"title":["A Comparison of Modeling Units in Sequence-to-Sequence Speech Recognition with the Transformer on Mandarin Chinese"],"prefix":"10.1007","author":[{"given":"Shiyu","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Linhao","family":"Dong","sequence":"additional","affiliation":[]},{"given":"Shuang","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Xu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,11,17]]},"reference":[{"key":"19_CR1","doi-asserted-by":"crossref","unstructured":"Chan, W., Lane, I.: On online attention-based speech recognition and joint Mandarin character-pinyin training. In: Interspeech, pp. 3404\u20133408 (2016)","DOI":"10.21437\/Interspeech.2016-334"},{"key":"19_CR2","unstructured":"Chiu, C.C., et al.: State-of-the-art speech recognition with sequence-to-sequence models. arXiv preprint arXiv:1712.01769 (2017)"},{"issue":"1","key":"19_CR3","doi-asserted-by":"publisher","first-page":"30","DOI":"10.1109\/TASL.2011.2134090","volume":"20","author":"GE Dahl","year":"2012","unstructured":"Dahl, G.E., Yu, D., Deng, L., Acero, A.: Context-dependent pre-trained deep neural networks for large-vocabulary speech recognition. IEEE Trans. Audio Speech Lang. Process. 20(1), 30\u201342 (2012)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"19_CR4","doi-asserted-by":"crossref","unstructured":"Hori, T., Watanabe, S., Zhang, Y., Chan, W.: Advances in joint CTC-attention based end-to-end speech recognition with a deep CNN encoder and RNN-LM. arXiv preprint arXiv:1706.02737 (2017)","DOI":"10.21437\/Interspeech.2017-1296"},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Kannan, A., Wu, Y., Nguyen, P., Sainath, T.N., Chen, Z., Prabhavalkar, R.: An analysis of incorporating an external language model into a sequence-to-sequence model. arXiv preprint arXiv:1712.01996 (2017)","DOI":"10.1109\/ICASSP.2018.8462682"},{"key":"19_CR6","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Dong, L., Xu, S., Xu, B.: Speech-transformer: a no-recurrence sequence-to-sequence model for speech recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP, pp. 5884\u20135888. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"19_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"724","DOI":"10.1007\/11939993_73","volume-title":"Chinese Spoken Language Processing","author":"Y Liu","year":"2006","unstructured":"Liu, Y., Fung, P., Yang, Y., Cieri, C., Huang, S., Graff, D.: HKUST\/MTS: a very large scale Mandarin telephone speech corpus. In: Huo, Q., Ma, B., Chng, E.-S., Li, H. (eds.) ISCSLP 2006. LNCS, vol. 4274, pp. 724\u2013735. Springer, Heidelberg (2006). https:\/\/doi.org\/10.1007\/11939993_73"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Prabhavalkar, R., Sainath, T.N., Li, B., Rao, K., Jaitly, N.: An analysis of attention in sequence-to-sequence models. In: Proceedings of Interspeech (2017)","DOI":"10.21437\/Interspeech.2017-232"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Sainath, T.N., et al.: No need for a lexicon? Evaluating the value of the pronunciation lexica in end-to-end models. arXiv preprint arXiv:1712.01864 (2017)","DOI":"10.1109\/ICASSP.2018.8462380"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Sak, H., Senior, A., Beaufays, F.: Long short-term memory recurrent neural network architectures for large scale acoustic modeling. In: Fifteenth Annual Conference of the International Speech Communication Association (2014)","DOI":"10.21437\/Interspeech.2014-80"},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"Sak, H., Senior, A., Rao, K., Beaufays, F.: Fast and accurate recurrent neural network acoustic models for speech recognition. arXiv preprint arXiv:1507.06947 (2015)","DOI":"10.21437\/Interspeech.2015-350"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Senior, A., Sak, H., Shafran, I.: Context dependent phone models for LSTM RNN acoustic modelling. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP, pp. 4585\u20134589. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178839"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Sennrich, R., Haddow, B., Birch, A.: Neural machine translation of rare words with subword units. arXiv preprint arXiv:1508.07909 (2015)","DOI":"10.18653\/v1\/P16-1162"},{"key":"19_CR15","doi-asserted-by":"crossref","unstructured":"Shan, C., Zhang, J., Wang, Y., Xie, L.: Attention-based end-to-end speech recognition on voice search. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4764\u20134768 (2018)","DOI":"10.1109\/ICASSP.2018.8462492"},{"key":"19_CR16","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2818\u20132826 (2016)","DOI":"10.1109\/CVPR.2016.308"},{"key":"19_CR17","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 6000\u20136010 (2017)"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xu, S., Xu, B.: Multidimensional residual learning based on recurrent neural networks for acoustic modeling. In: Interspeech, pp. 3419\u20133423 (2016)","DOI":"10.21437\/Interspeech.2016-677"},{"key":"19_CR19","doi-asserted-by":"crossref","unstructured":"Zhou, S., Dong, L., Xu, S., Xu, B.: Syllable-based sequence-to-sequence speech recognition with the transformer in Mandarin Chinese. ArXiv e-prints, April 2018","DOI":"10.21437\/Interspeech.2018-1107"},{"key":"19_CR20","doi-asserted-by":"crossref","unstructured":"Zou, W., Jiang, D., Zhao, S., Li, X.: A comparable study of modeling units for end-to-end Mandarin speech recognition. arXiv preprint arXiv:1805.03832 (2018)","DOI":"10.1109\/ISCSLP.2018.8706661"}],"container-title":["Lecture Notes in Computer Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-04221-9_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,13]],"date-time":"2024-03-13T12:40:55Z","timestamp":1710333655000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-04221-9_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030042202","9783030042219"],"references-count":20,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-04221-9_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"17 November 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Siem Reap","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Cambodia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 December 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 December 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/conference.cs.cityu.edu.hk\/iconip\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"575","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"401","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"70% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"6","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}