{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T20:35:47Z","timestamp":1743021347192,"version":"3.40.3"},"publisher-location":"Cham","reference-count":25,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031404979"},{"type":"electronic","value":"9783031404986"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-40498-6_27","type":"book-chapter","created":{"date-parts":[[2023,8,22]],"date-time":"2023-08-22T23:02:34Z","timestamp":1692745354000},"page":"304-316","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Towards End-to-End Speech-to-Text Summarization"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2435-1416","authenticated-orcid":false,"given":"Raul","family":"Monteiro","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5246-0402","authenticated-orcid":false,"given":"Diogo","family":"Pernes","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,8,23]]},"reference":[{"key":"27_CR1","doi-asserted-by":"publisher","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M., Lin, H. (eds.) Advances in Neural Information Processing Systems, vol. 33, pp. 12449\u201312460. Curran Associates, Inc. (2020). https:\/\/doi.org\/10.48550\/arXiv.2006.11477","DOI":"10.48550\/arXiv.2006.11477"},{"issue":"4","key":"27_CR2","doi-asserted-by":"publisher","first-page":"357","DOI":"10.1109\/TASSP.1980.1163420","volume":"28","author":"S Davis","year":"1980","unstructured":"Davis, S., Mermelstein, P.: Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences. IEEE Trans. Acoust. Speech Signal Process. 28(4), 357\u2013366 (1980). https:\/\/doi.org\/10.1109\/TASSP.1980.1163420","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"27_CR3","unstructured":"Evain, S., et al.: Task agnostic and task specific self-supervised learning from speech with lebenchmark. In: Thirty-Fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2) (2021)"},{"key":"27_CR4","doi-asserted-by":"publisher","unstructured":"Ferreira, D.C., Martins, A.F.T., Almeida, M.S.C.: Jointly learning to embed and predict with multiple languages. In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics, Berlin, Germany (Volume 1: Long Papers), pp. 2019\u20132028. Association for Computational Linguistics (2016). https:\/\/doi.org\/10.18653\/v1\/P16-1190","DOI":"10.18653\/v1\/P16-1190"},{"key":"27_CR5","doi-asserted-by":"publisher","unstructured":"Furui, S.: Speaker-independent isolated word recognition based on emphasized spectral dynamics. In: IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 1986, vol. 11, pp. 1991\u20131994 (1986). https:\/\/doi.org\/10.1109\/ICASSP.1986.1168654","DOI":"10.1109\/ICASSP.1986.1168654"},{"key":"27_CR6","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1016\/j.eswa.2018.12.011","volume":"121","author":"S Gupta","year":"2019","unstructured":"Gupta, S., Gupta, S.K.: Abstractive summarization: an overview of the state of the art. Expert Syst. Appl. 121, 49\u201365 (2019). https:\/\/doi.org\/10.1016\/j.eswa.2018.12.011","journal-title":"Expert Syst. Appl."},{"key":"27_CR7","doi-asserted-by":"publisher","unstructured":"Lewis, M., et al.: BART: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 7871\u20137880. Association for Computational Linguistics, Online (2020). https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.703","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"27_CR8","doi-asserted-by":"publisher","unstructured":"Martin, L., et al.: CamemBERT: a tasty French language model. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 7203\u20137219. Association for Computational Linguistics, Online (2020). https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.645","DOI":"10.18653\/v1\/2020.acl-main.645"},{"key":"27_CR9","doi-asserted-by":"publisher","unstructured":"Matsuura, K., et al.: Leveraging large text corpora for end-to-end speech summarization (2023). https:\/\/doi.org\/10.48550\/arXiv.2303.00978","DOI":"10.48550\/arXiv.2303.00978"},{"key":"27_CR10","doi-asserted-by":"publisher","unstructured":"Morcos, A., Raghu, M., Bengio, S.: Insights on representational similarity in neural networks with canonical correlation. In: Bengio, S., Wallach, H., Larochelle, H., Grauman, K., Cesa-Bianchi, N., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 31. Curran Associates, Inc. (2018). https:\/\/doi.org\/10.48550\/arXiv.1806.05759","DOI":"10.48550\/arXiv.1806.05759"},{"key":"27_CR11","doi-asserted-by":"publisher","unstructured":"Ogawa, A., Hirao, T., Nakatani, T., Nagata, M.: ILP-based compressive speech summarization with content word coverage maximization and its oracle performance analysis. In: IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2019, pp. 7190\u20137194 (2019). https:\/\/doi.org\/10.1109\/ICASSP.2019.8683543","DOI":"10.1109\/ICASSP.2019.8683543"},{"key":"27_CR12","doi-asserted-by":"publisher","unstructured":"Pasad, A., Chou, J.C., Livescu, K.: Layer-wise analysis of a self-supervised speech representation model. In: 2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 914\u2013921 (2021). https:\/\/doi.org\/10.1109\/ASRU51503.2021.9688093","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"27_CR13","doi-asserted-by":"publisher","unstructured":"Paulus, R., Xiong, C., Socher, R.: A deep reinforced model for abstractive summarization. In: International Conference on Learning Representations (2018). https:\/\/doi.org\/10.48550\/arXiv.1705.04304","DOI":"10.48550\/arXiv.1705.04304"},{"key":"27_CR14","doi-asserted-by":"publisher","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: global vectors for word representation. In: Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543 (2014). https:\/\/doi.org\/10.3115\/v1\/D14-1162","DOI":"10.3115\/v1\/D14-1162"},{"key":"27_CR15","doi-asserted-by":"publisher","unstructured":"Pernes, D., Mendes, A., Martins, A.F.T.: Improving abstractive summarization with energy-based re-ranking. In: Proceedings of the 2nd Workshop on Natural Language Generation, Evaluation, and Metrics, Abu Dhabi, United Arab Emirates, pp. 1\u201317. Association for Computational Linguistics (2022). https:\/\/doi.org\/10.48550\/arXiv.2210.15553","DOI":"10.48550\/arXiv.2210.15553"},{"key":"27_CR16","unstructured":"Peters, B., Correia, G., Mihaylova, T.: An exploration of teacher forcing techniques for neural machine translation (2018)"},{"key":"27_CR17","doi-asserted-by":"publisher","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision (2022). https:\/\/doi.org\/10.48550\/ARXIV.2212.04356","DOI":"10.48550\/ARXIV.2212.04356"},{"key":"27_CR18","doi-asserted-by":"publisher","unstructured":"Rezazadegan, D., et al.: Automatic speech summarisation: a scoping review (2020). https:\/\/doi.org\/10.48550\/arXiv.2008.11897","DOI":"10.48550\/arXiv.2008.11897"},{"key":"27_CR19","doi-asserted-by":"publisher","first-page":"264","DOI":"10.1162\/tacl_a_00313","volume":"8","author":"S Rothe","year":"2020","unstructured":"Rothe, S., Narayan, S., Severyn, A.: Leveraging pre-trained checkpoints for sequence generation tasks. Trans. Assoc. Comput. Linguist. 8, 264\u2013280 (2020). https:\/\/doi.org\/10.1162\/tacl_a_00313","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"27_CR20","doi-asserted-by":"publisher","unstructured":"Scialom, T., Dray, P.A., Lamprier, S., Piwowarski, B., Staiano, J.: MLSUM: the multilingual summarization corpus. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 8051\u20138067. Association for Computational Linguistics, Online (2020). https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.647","DOI":"10.18653\/v1\/2020.emnlp-main.647"},{"key":"27_CR21","doi-asserted-by":"publisher","unstructured":"Sharma, R., Palaskar, S., Black, A.W., Metze, F.: End-to-end speech summarization using restricted self-attention. In: IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2022, pp. 8072\u20138076 (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747320","DOI":"10.1109\/ICASSP43922.2022.9747320"},{"key":"27_CR22","doi-asserted-by":"publisher","unstructured":"\u00c1kos T\u00fcndik, M., Kasz\u00e1s, V., Szasz\u00e1k, G.: Assessing the semantic space bias caused by ASR error propagation and its effect on spoken document summarization. In: Proceedings of the Interspeech 2019, pp. 1333\u20131337 (2019). https:\/\/doi.org\/10.21437\/Interspeech.2019-2154","DOI":"10.21437\/Interspeech.2019-2154"},{"key":"27_CR23","doi-asserted-by":"publisher","unstructured":"Weng, S.Y., Lo, T.H., Chen, B.: An effective contextual language modeling framework for speech summarization with augmented features. In: 2020 28th European Signal Processing Conference (EUSIPCO), pp. 316\u2013320 (2021). https:\/\/doi.org\/10.23919\/Eusipco47968.2020.9287432","DOI":"10.23919\/Eusipco47968.2020.9287432"},{"key":"27_CR24","doi-asserted-by":"publisher","unstructured":"Zhang, Y., et al.: An exploratory study on long dialogue summarization: what works and what\u2019s next. In: Findings of the Association for Computational Linguistics: EMNLP 2021, Punta Cana, Dominican Republic, pp. 4426\u20134433. Association for Computational Linguistics (2021). https:\/\/doi.org\/10.18653\/v1\/2021.findings-emnlp.377","DOI":"10.18653\/v1\/2021.findings-emnlp.377"},{"key":"27_CR25","doi-asserted-by":"publisher","unstructured":"Zhuang, L., Wayne, L., Ya, S., Jun, Z.: A robustly optimized BERT pre-training approach with post-training. In: Proceedings of the 20th Chinese National Conference on Computational Linguistics, Huhhot, China, pp. 1218\u20131227. Chinese Information Processing Society of China (2021). https:\/\/doi.org\/10.48550\/arXiv.1907.11692","DOI":"10.48550\/arXiv.1907.11692"}],"container-title":["Lecture Notes in Computer Science","Text, Speech, and Dialogue"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-40498-6_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,22]],"date-time":"2023-08-22T23:06:19Z","timestamp":1692745579000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-40498-6_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031404979","9783031404986"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-40498-6_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"23 August 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"TSD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Text, Speech, and Dialogue","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pilsen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Czech Republic","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 September 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 September 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tsd2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.kiv.zcu.cz\/tsd2023\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMS & back-office system","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"64","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"31","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"48% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.56","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}