{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T08:53:04Z","timestamp":1763196784849,"version":"3.45.0"},"publisher-location":"Singapore","reference-count":57,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819533480","type":"print"},{"value":"9789819533497","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,16]],"date-time":"2025-11-16T00:00:00Z","timestamp":1763251200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,16]],"date-time":"2025-11-16T00:00:00Z","timestamp":1763251200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-3349-7_20","type":"book-chapter","created":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T08:49:39Z","timestamp":1763196579000},"page":"255-268","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Adaptive Inner Speech Text Alignment for\u00a0LLM-Based Speech Translation"],"prefix":"10.1007","author":[{"given":"Henglyu","family":"Liu","sequence":"first","affiliation":[]},{"given":"Andong","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Kehai","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Xuefeng","family":"Bai","sequence":"additional","affiliation":[]},{"given":"Meizhi","family":"Zhong","sequence":"additional","affiliation":[]},{"given":"Yuan","family":"Qiu","sequence":"additional","affiliation":[]},{"given":"Min","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,16]]},"reference":[{"key":"20_CR1","doi-asserted-by":"crossref","unstructured":"Alinejad, A., Sarkar, A.: Effectively pretraining a speech translation decoder with machine translation data. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 8014\u20138020 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.644"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Ao, J., et\u00a0al.: Speecht5: unified-modal encoder-decoder pre-training for spoken language processing. arXiv preprint arXiv:2110.07205 (2021)","DOI":"10.18653\/v1\/2022.acl-long.393"},{"key":"20_CR3","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. In: Advances in Neural Information Processing Systems, vol. 33, pp. 12449\u201312460 (2020)"},{"key":"20_CR4","unstructured":"Bapna, A., et al.: Slam: a unified encoder for speech and language modeling via speech-text joint pre-training. arXiv preprint arXiv:2110.10329 (2021)"},{"key":"20_CR5","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems, vol. 33, pp. 1877\u20131901 (2020)"},{"key":"20_CR6","unstructured":"Chen, F., et al.: X-LLM: bootstrapping advanced large language models by treating multi-modalities as foreign languages. arXiv preprint arXiv:2305.04160 (2023)"},{"key":"20_CR7","doi-asserted-by":"crossref","unstructured":"Chen, G., et\u00a0al.: Gigaspeech: an evolving, multi-domain ASR corpus with 10,000 hours of transcribed audio. arXiv preprint arXiv:2106.06909 (2021)","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"20_CR8","unstructured":"Chen, S., et al.: Beats: audio pre-training with acoustic tokenizers. arXiv preprint arXiv:2212.09058 (2022)"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Chen, X., Zhang, S., Bai, Q., Chen, K., Nakamura, S.: LLAST: improved end-to-end speech translation system leveraged by large language models. arXiv preprint arXiv:2407.15415 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.416"},{"key":"20_CR10","unstructured":"Chiang, W.L., Li, Z., Lin, Z., et\u00a0al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* chatgpt quality. See https:\/\/vicuna.lmsys.org. Accessed 14 Apr 2023. 2(3), 6 (2023)"},{"key":"20_CR11","doi-asserted-by":"crossref","unstructured":"Chopra, S., Hadsell, R., LeCun, Y.: Learning a similarity metric discriminatively, with application to face verification. In: 2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR 2005), vol.\u00a01, pp. 539\u2013546. IEEE (2005)","DOI":"10.1109\/CVPR.2005.202"},{"key":"20_CR12","unstructured":"Chu, Y., et\u00a0al.: Qwen2-audio technical report. arXiv preprint arXiv:2407.10759 (2024)"},{"key":"20_CR13","unstructured":"Chu, Y., Xu, J., Zhou, X., et\u00a0al.: Qwen-audio: advancing universal audio understanding via unified large-scale audio-language models. arXiv preprint arXiv:2311.07919 (2023)"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Denkowski, M., Lavie, A.: Meteor universal: language specific translation evaluation for any target language. In: Proceedings of the Ninth Workshop on Statistical Machine Translation, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"20_CR15","unstructured":"Di\u00a0Gangi, M.A., Cattoni, R., Bentivogli, L., Negri, M., Turchi, M.: Must-c: a multilingual speech translation corpus. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 2012\u20132017. Association for Computational Linguistics (2019)"},{"key":"20_CR16","doi-asserted-by":"crossref","unstructured":"Drossos, K., Lipping, S., Virtanen, T.: Clotho: an audio captioning dataset. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 736\u2013740. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"20_CR17","doi-asserted-by":"crossref","unstructured":"Fang, Q., Ye, R., Li, L., Feng, Y., Wang, M.: Stemm: self-learning with speech-text manifold mixup for speech translation. arXiv preprint arXiv:2203.10426 (2022)","DOI":"10.18653\/v1\/2022.acl-long.486"},{"key":"20_CR18","unstructured":"Frogner, C., Zhang, C., Mobahi, H., Araya, M., Poggio, T.A.: Learning with a wasserstein loss. In: Advances in Neural Information Processing Systems, vol. 28 (2015)"},{"key":"20_CR19","unstructured":"Goodfellow, I.J., et al.: Generative adversarial nets. In: Advances in Neural Information Processing Systems, vol. 27 (2014)"},{"key":"20_CR20","doi-asserted-by":"crossref","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd International Conference on Machine Learning, pp. 369\u2013376 (2006)","DOI":"10.1145\/1143844.1143891"},{"key":"20_CR21","unstructured":"Hu, E.J., Shen, Y., Wallis, P., et\u00a0al.: Lora: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"Hu, S., Zhou, L., Liu, S., et\u00a0al.: Wavllm: towards robust and adaptive speech large language model. arXiv preprint arXiv:2404.00656 (2024)","DOI":"10.18653\/v1\/2024.findings-emnlp.263"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Iranzo-S\u00e1nchez, J., Silvestre-Cerda, J.A., Jorge, J., et\u00a0al.: Europarl-st: a multilingual corpus for speech translation of parliamentary debates. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 8229\u20138233. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9054626"},{"key":"20_CR24","unstructured":"Kim, C.D., Kim, B., Lee, H., Kim, G.: Audiocaps: generating captions for audios in the wild. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 119\u2013132 (2019)"},{"key":"20_CR25","unstructured":"Le, P.H., Gong, H., Wang, C., Pino, J., Lecouteux, B., Schwab, D.: Pre-training for speech translation: CTC meets optimal transport. In: International Conference on Machine Learning, pp. 18667\u201318685. PMLR (2023)"},{"key":"20_CR26","doi-asserted-by":"crossref","unstructured":"Li, C., Wong, C., Zhang, S., et\u00a0al.: Llava-med: training a large language-and-vision assistant for biomedicine in one day. In: Advances in Neural Information Processing Systems, vol. 36 (2024)","DOI":"10.32388\/VLXB6M"},{"key":"20_CR27","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR (2023)"},{"key":"20_CR28","unstructured":"Liang, P.P., Zadeh, A., Morency, L.P.: Foundations and trends in multimodal machine learning: principles, challenges, and open questions. arXiv preprint arXiv:2209.03430 (2022)"},{"key":"20_CR29","unstructured":"Van\u00a0der Maaten, L., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9(11) (2008)"},{"key":"20_CR30","doi-asserted-by":"crossref","unstructured":"Mei, X., Meng, C., Liu, H., et al.: Wavcaps: a chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. IEEE\/ACM Trans. Audio Speech Lang. Process. (2024)","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"20_CR31","first-page":"30","volume":"13","author":"TA Nguyen","year":"2025","unstructured":"Nguyen, T.A., Muller, B., Yu, B., et al.: Spirit-LM: interleaved spoken and written language model. Trans. Assoc. Comput. Linguist. 13, 30\u201352 (2025)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"20_CR32","unstructured":"OpenAI: Gpt-4 technical report (2024). https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"20_CR33","doi-asserted-by":"crossref","unstructured":"Ouyang, S., Ye, R., Li, L.: Waco: Word-aligned contrastive learning for speech translation. arXiv preprint arXiv:2212.09359 (2022)","DOI":"10.18653\/v1\/2023.acl-long.216"},{"key":"20_CR34","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: an ASR corpus based on public domain audio books. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5206\u20135210. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"issue":"3","key":"20_CR35","doi-asserted-by":"publisher","first-page":"1065","DOI":"10.1214\/aoms\/1177704472","volume":"33","author":"E Parzen","year":"1962","unstructured":"Parzen, E.: On estimation of a probability density function and mode. Ann. Math. Stat. 33(3), 1065\u20131076 (1962)","journal-title":"Ann. Math. Stat."},{"key":"20_CR36","doi-asserted-by":"crossref","unstructured":"Peyr\u00e9, G., Cuturi, M., et\u00a0al.: Computational optimal transport: with applications to data science. Found. Trends\u00ae in Mach. Learn. 11(5-6), 355\u2013607 (2019)","DOI":"10.1561\/2200000073"},{"key":"20_CR37","doi-asserted-by":"crossref","unstructured":"Post, M.: A call for clarity in reporting bleu scores. arXiv preprint arXiv:1804.08771 (2018)","DOI":"10.18653\/v1\/W18-6319"},{"key":"20_CR38","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. In: International Conference on Machine Learning, pp. 28492\u201328518. PMLR (2023)"},{"key":"20_CR39","unstructured":"Rubenstein, P.K., Asawaroengchai, C., Nguyen, D.D., et\u00a0al.: Audiopalm: a large language model that can speak and listen. arXiv preprint arXiv:2306.12925 (2023)"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Sethiya, N., Maurya, C.K.: End-to-end speech-to-text translation: a survey. Comput. Speech Lang. 101751 (2024)","DOI":"10.1016\/j.csl.2024.101751"},{"key":"20_CR41","unstructured":"Sohn, K.: Improved deep metric learning with multi-class n-pair loss objective. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"key":"20_CR42","unstructured":"Tang, C., Yu, W., Sun, G., et\u00a0al.: Salmonn: towards generic hearing abilities for large language models. arXiv preprint arXiv:2310.13289 (2023)"},{"key":"20_CR43","doi-asserted-by":"crossref","unstructured":"Tang, Y., et\u00a0al.: Unified speech-text pre-training for speech translation and recognition. arXiv preprint arXiv:2204.05409 (2022)","DOI":"10.18653\/v1\/2022.acl-long.105"},{"key":"20_CR44","unstructured":"Touvron, H., Martin, L., Stone, K., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"20_CR45","doi-asserted-by":"crossref","unstructured":"Tsiamas, I., G\u00e1llego, G.I., Fonollosa, J.A., Costa-juss\u00e0, M.R.: Pushing the limits of zero-shot end-to-end speech translation. arXiv preprint arXiv:2402.10422 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.847"},{"key":"20_CR46","unstructured":"Wang, C., Pino, J., Wu, A., Gu, J.: CoVoST: A diverse multilingual speech-to-text translation corpus. In: Proceedings of The 12th Language Resources and Evaluation Conference, pp. 4197\u20134203. European Language Resources Association, Marseille, France (2020). https:\/\/www.aclweb.org\/anthology\/2020.lrec-1.517"},{"key":"20_CR47","doi-asserted-by":"crossref","unstructured":"Wang, C., Wu, A., Gu, J., Pino, J.: Covost 2 and massively multilingual speech translation. In: Interspeech, pp. 2247\u20132251 (2021)","DOI":"10.21437\/Interspeech.2021-2027"},{"key":"20_CR48","unstructured":"Wang, C., et al.: BLSP: bootstrapping language-speech pre-training via behavior alignment of continuation writing. arXiv preprint arXiv:2309.00916 (2023)"},{"key":"20_CR49","unstructured":"Wang, C., Liao, M., Huang, Z., Zhang, J.: BLSP-KD: bootstrapping language-speech pre-training via knowledge distillation. arXiv preprint arXiv:2405.19041 (2024)"},{"key":"20_CR50","doi-asserted-by":"crossref","unstructured":"Wang, C., Wu, Y., Liu, S., Yang, Z., Zhou, M.: Bridging the gap between pre-training and fine-tuning for end-to-end speech translation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 9161\u20139168 (2020)","DOI":"10.1609\/aaai.v34i05.6452"},{"key":"20_CR51","doi-asserted-by":"crossref","unstructured":"Xu, C., et\u00a0al.: Stacked acoustic-and-textual encoding: integrating the pre-trained models into speech translation encoders. arXiv preprint arXiv:2105.05752 (2021)","DOI":"10.18653\/v1\/2021.acl-long.204"},{"key":"20_CR52","unstructured":"Yang, A., et\u00a0al.: Qwen2 technical report (2024). https:\/\/arxiv.org\/abs\/2407.10671"},{"key":"20_CR53","doi-asserted-by":"crossref","unstructured":"Zhang, D., et al.: Speechgpt: empowering large language models with intrinsic cross-modal conversational abilities. arXiv preprint arXiv:2305.11000 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.1055"},{"key":"20_CR54","unstructured":"Zhang, H., Si, N., Chen, Y., et\u00a0al.: Tuning large language model for end-to-end speech translation. arXiv preprint arXiv:2310.02050 (2023)"},{"key":"20_CR55","unstructured":"Zhang, H., Chen, K., Bai, X., Xiang, Y., Zhang, M.: Lingualift: an effective two-stage instruction tuning framework for low-resource language tasks. arXiv preprint arXiv:2412.12499 (2024)"},{"key":"20_CR56","doi-asserted-by":"crossref","unstructured":"Zhong, M., Liu, L., Chen, K., Yang, M., Zhang, M.: Context consistency between training and inference in simultaneous machine translation. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 13465\u201313476 (2024)","DOI":"10.18653\/v1\/2024.acl-long.727"},{"key":"20_CR57","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Fang, Q., Feng, Y.: CMOT: cross-modal mixup via optimal transport for speech translation. arXiv preprint arXiv:2305.14635 (2023)","DOI":"10.18653\/v1\/2023.acl-long.436"}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Chinese Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-3349-7_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T08:49:48Z","timestamp":1763196588000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-3349-7_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,16]]},"ISBN":["9789819533480","9789819533497"],"references-count":57,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-3349-7_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,16]]},"assertion":[{"value":"16 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"NLPCC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF International Conference on Natural Language Processing and Chinese Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"nlpcc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/tcci.ccf.org.cn\/conference\/2025\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}