{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T14:56:41Z","timestamp":1743001001879,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":28,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819916412"},{"type":"electronic","value":"9789819916429"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-981-99-1642-9_4","type":"book-chapter","created":{"date-parts":[[2023,4,13]],"date-time":"2023-04-13T12:14:57Z","timestamp":1681388097000},"page":"39-50","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Shifted Chunk Encoder for\u00a0Transformer Based Streaming End-to-End ASR"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6482-4522","authenticated-orcid":false,"given":"Fangyuan","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bo","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,4,14]]},"reference":[{"key":"4_CR1","doi-asserted-by":"crossref","unstructured":"Li, J., Ye, G., Das, A., Zhao, R., Gong, Y.: Advancing acoustic-to-word CTC model. In: ICASSP 2018\u201343rd IEEE International Conference on Acoustics, Speech and Signal Processing, 22\u201327 April, Seoul, South Korea, pp. 5794\u20135798 (2018)","DOI":"10.1109\/ICASSP.2018.8462017"},{"key":"4_CR2","doi-asserted-by":"crossref","unstructured":"Graves, A., Fernandez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: ICML 2006\u201323rd International Conference on Machine Learning, 25\u201329 June, Pittsburgh, Pennsylvania, pp. 369\u2013376 (2006)","DOI":"10.1145\/1143844.1143891"},{"key":"4_CR3","doi-asserted-by":"crossref","unstructured":"Battenberg, E., Chen, J.T., et al.: Exploring neural transducers for end-to-end speech recognition. In: ASRU 2017\u20132017 IEEE Automatic Speech Recognition and Understanding Workshop, 16\u201320 December, Okinawa, Japan, pp. 206\u2013213 (2017)","DOI":"10.1109\/ASRU.2017.8268937"},{"key":"4_CR4","doi-asserted-by":"crossref","unstructured":"Chen, X., Wu, Y., Wang, Z., et al.: Developing real-time streaming transformer transducer for speech recognition on large-scale dataset. In: ICASSP 2021\u201346rd IEEE International Conference on Acoustics, Speech and Signal Processing, 6\u201311 June, Toronto, Ontario, Canada, pp. 5904\u20135908 (2021)","DOI":"10.1109\/ICASSP39728.2021.9413535"},{"key":"4_CR5","doi-asserted-by":"crossref","unstructured":"Chan, W., Jaitly, N., Le, Q., Vinyals, O.: Listen, attend and spell: a neural network for large vocabulary conversational speech recognition. In: ICASSP 2016\u201341rd IEEE International Conference on Acoustics, Speech and Signal Processing, 20\u201325 March, Shanghai, China, pp. 4960\u20134964 (2016)","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"4_CR6","doi-asserted-by":"crossref","unstructured":"Gulati, A., Qin, J., Chiu, C.C., et al.: Conformer: convolution-augmented transformer for speech recognition. In: Interspeech 2020\u201321rd Annual Conference of the International Speech Communication Association, 25\u201330 October, Shanghai, China, pp. 5036\u20135040 (2020)","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"4_CR7","doi-asserted-by":"crossref","unstructured":"Prabhavalkar, R., Rao, K., Sainath, T.N., Li, B., Johnson, L., Jaitly, N.: A comparison of sequence-to-sequence models for speech recognition. In: Interspeech 2017\u201318rd Annual Conference of the International Speech Communication Association, 20\u201324 August, Stockholm, Stockholm County, Swedenm, pp. 939\u2013939 (2017)","DOI":"10.21437\/Interspeech.2017-233"},{"issue":"8","key":"4_CR8","doi-asserted-by":"publisher","first-page":"1240","DOI":"10.1109\/JSTSP.2017.2763455","volume":"11","author":"S Watanabe","year":"2017","unstructured":"Watanabe, S., Hori, T., Kim, S., Hershey, J.R., Hayashi, T.: Hybrid CTC\/Attention architecture for end-to-end speech recognition. IEEE J. Sel. Top. Sign. Process. 11(8), 1240\u20131253 (2017)","journal-title":"IEEE J. Sel. Top. Sign. Process."},{"key":"4_CR9","doi-asserted-by":"publisher","first-page":"1452","DOI":"10.1109\/TASLP.2020.2987752","volume":"28","author":"HR Miao","year":"2020","unstructured":"Miao, H.R., Cheng, G.F., Zhang, P.Y., Yan, Y.H.: Online Hybrid CTC\/Attention end-to-end automatic speech recognition architecture. IEEE\/ACM Trans. Audio Speech Lang. Process. 28, 1452\u20131465 (2020)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"4_CR10","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et al.: Attention is all you need. In: NIPS 2017\u201331rd Conference on Neural Information Processing Systems, 4\u20139 December, Long Beach, California, U.S.A., pp. 5998\u20136008 (2017)"},{"key":"4_CR11","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Zhou, S., Xu, S., Xu, B.: Word-level permutation and improved lower frame rate for rnn-based acoustic modeling. In: ICONIP 2017\u201324rd International Conference on Neural Information Processing, 14\u201318 November, Guangzhou, China (2017)","DOI":"10.1007\/978-3-319-70136-3_91"},{"key":"4_CR12","doi-asserted-by":"crossref","unstructured":"Guo, P.C., Boyer, F., Chang, X.K., et al.: Recent developments on Espnet toolkit boosted by conformer. In: ICASSP 2021\u201346rd IEEE International Conference on Acoustics, Speech and Signal Processing, 6\u201311 June, Toronto, Ontario, Canada, pp. 5874\u20135878 (2021)","DOI":"10.1109\/ICASSP39728.2021.9414858"},{"key":"4_CR13","doi-asserted-by":"crossref","unstructured":"Yao, Z., Wu, D., Wang, X., et al.: WeNet: production oriented streaming and non-streaming end-to-end speech recognition toolkit. In: Interspeech 2021\u201322rd Annual Conference of the International Speech Communication Association, 30 August-3 September, Brno, Czech Republic (2021)","DOI":"10.21437\/Interspeech.2021-1983"},{"key":"4_CR14","unstructured":"Yu, J.H., Han, W., et al.: Universal ASR: unify and improve streaming ASR with full-context modeling. arXiv preprint arXiv:2010.06030 (2020)"},{"key":"4_CR15","unstructured":"Tripathi, A., Kim, J., Zhang, Q., et al.: Transformer transducer: one model unifying streaming and non-streaming speech recognition. arXiv preprint arXiv:2010.03192 (2020)"},{"key":"4_CR16","unstructured":"Zhang, B.B., Wu, D., Yao, Z.Y., et al.: Unified streaming and non-streaming two-pass end-to-end model for speech recognition. arXiv preprint arXiv:2012.05481 (2020)"},{"key":"4_CR17","unstructured":"Wu, D., Zhang, B.B., Yang, C., et al.: U2++: unified two-pass bidirectional end-to-end model for speech recognition. arXiv preprint arXiv:2106.05642 (2021)"},{"key":"4_CR18","doi-asserted-by":"crossref","unstructured":"Tian, Z.K., Yi, J.Y., Bai, Y., et al.: Synchronous transformers for end-to-end speech recognition. In: ICASSP 2020\u201345rd IEEE International Conference on Acoustics, Speech and Signal Processing, 4\u20138 May, Barcelona, Spain, pp. 7884\u20137888 (2020)","DOI":"10.1109\/ICASSP40776.2020.9054260"},{"key":"4_CR19","unstructured":"Chiu, C.-C., Raffel, C.: Monotonic chunkwise attention. In: ICLR 2018\u20136rd International Conference on Learning Representations, 30 April-3 May, Vancouver Canada (2018)"},{"key":"4_CR20","doi-asserted-by":"crossref","unstructured":"Zhang, S.L., Gao, Z.F., Luo, H.N., et al.: Streaming chunk-aware multihead attention for online end-to-end speech recognition. In: Interspeech 2020\u201321rd Annual Conference of the International Speech Communication Association, 25\u201330 October, Shanghai, China, pp. 2142\u20132146 (2020)","DOI":"10.21437\/Interspeech.2020-1972"},{"key":"4_CR21","doi-asserted-by":"crossref","unstructured":"Inaguma, H., Mimura, M., Kawahara, T.: Enhancing monotonic multihead attention for streaming ASR. In: Interspeech 2020\u201321rd Annual Conference of the International Speech Communication Association, 25\u201330 October, Shanghai, China, pp. 2137\u20132141 (2020)","DOI":"10.21437\/Interspeech.2020-1780"},{"key":"4_CR22","doi-asserted-by":"crossref","unstructured":"Shi, Y.Y., Wang, Y.Q., Wu, C.Y., et al.: Emformer: efficient memory transformer based acoustic model for low latency streaming speech recognition. In: ICASSP 2021\u201346rd IEEE International Conference on Acoustics, Speech and Signal Processing, 6\u201311 June, Toronto, Ontario, Canada, pp. 6783\u20136787 (2021)","DOI":"10.1109\/ICASSP39728.2021.9414560"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Z., Cao, Y.T., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: ICCV 2021\u201346rd International Conference on Computer Vision, 11\u201317 October, Virtual, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"4_CR24","doi-asserted-by":"crossref","unstructured":"Bu, H., Du, J., Na, X., Wu, B., Zheng, H.: Aishell-1: an open-source mandarin speech corpus and a speech recognition baseline. In: O-COCOSDA 2017\u201320rd Conference of the Oriental Chapter of the International Coordinating Committee on Speech Databases and Speech I\/O Systems and Assessment, 1\u20133 November, Seoul, South Korea, pp. 1\u20135 (2015)","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Park, D.S., Chan, W., Zhang, Y., Chiu, C.C., et, al.: Specaugment: a simple data augmentation method for automatic speech recognition. In: Interspeech 2019\u201320rd Annual Conference of the International Speech Communication Association, 15\u201319 September, Graz, Austria, pp. 2613\u20132617 (2019)","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Li, M., Zoril\u0103, C., Doddipatla, R.: Head-synchronous decoding for transformer-based streaming ASR. In: ICASSP 2021\u201346rd IEEE International Conference on Acoustics, Speech and Signal Processing, 6\u201311 June, Toronto, Ontario, Canada, pp. 5909\u20135913 (2021)","DOI":"10.1109\/ICASSP39728.2021.9414103"},{"key":"4_CR27","unstructured":"Wang, Z., Yang, W., Zhou, P., Chen, W.: WNARS: WFST based non-autoregressive streaming end-to-end speech recognition. arXiv preprint arXiv:2104.03587 (2021)"},{"key":"4_CR28","doi-asserted-by":"crossref","unstructured":"An, K., Zheng, H., Ou, Z., Xiang, H., Ding, K., Wan, G.: CUSIDE: chunking, simulating future context and decoding for streaming ASR. arXiv preprint arXiv:2203.16758(2022)","DOI":"10.21437\/Interspeech.2022-11214"}],"container-title":["Communications in Computer and Information Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-99-1642-9_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,18]],"date-time":"2024-10-18T06:48:23Z","timestamp":1729234103000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-99-1642-9_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9789819916412","9789819916429"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-981-99-1642-9_4","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"14 April 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"New Delhi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 November 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 November 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iconip2022.apnns.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Easy Chair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"810","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"359","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"44% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.65","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"ICONIP 2022 consists of a two-volume set, LNCS & CCIS, which includes 146 and 213 papers","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}