{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:00:29Z","timestamp":1775199629927,"version":"3.50.1"},"reference-count":26,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434615","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-7","source":"Crossref","is-referenced-by-count":0,"title":["Improving Multimodal Speech-To-Slide Alignment for Academic Lectures with Vision LLMs"],"prefix":"10.1109","author":[{"given":"Thomas","family":"Ranzenberger","sequence":"first","affiliation":[{"name":"Technische Hochschule N&#x00FC;rnberg,N&#x00FC;rnberg,Germany"}]},{"given":"Dominik","family":"Wagner","sequence":"additional","affiliation":[{"name":"Technische Hochschule N&#x00FC;rnberg,N&#x00FC;rnberg,Germany"}]},{"given":"Steffen","family":"Freisinger","sequence":"additional","affiliation":[{"name":"Technische Hochschule N&#x00FC;rnberg,N&#x00FC;rnberg,Germany"}]},{"given":"Tobias","family":"Bocklet","sequence":"additional","affiliation":[{"name":"Technische Hochschule N&#x00FC;rnberg,N&#x00FC;rnberg,Germany"}]},{"given":"Korbinian","family":"Riedhammer","sequence":"additional","affiliation":[{"name":"Technische Hochschule N&#x00FC;rnberg,N&#x00FC;rnberg,Germany"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.34190\/ecel.21.1.527"},{"key":"ref2","first-page":"168","article-title":"The hochschul-assistenz-system hans: an ml-based learning experience platform","volume-title":"Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung","author":"Ranzenberger"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.52403\/ijrr.20220506"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-978"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-359"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS.2003.1206037"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2010.802"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2011.2109727"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-10467-1_27"},{"key":"ref10","first-page":"670","article-title":"Automatic Transcript Generation from Presentation Slides","volume-title":"Proceedings of the 37th Pacific Asia Conference on Language, Information and Computation","author":"Nguyen"},{"key":"ref11","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proceedings of the 40th International Conference on Machine Learning, ser. ICML\u201923","author":"Radford"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/1577802.1577804"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01598"},{"key":"ref16","article-title":"Pixtral 12b","author":"Agrawal","year":"2024"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00159"},{"issue":"8","key":"ref21","first-page":"707","article-title":"Binary codes capable of correcting deletions, insertions and reversals","volume":"10","author":"Levenshtein","year":"1966","journal-title":"doklady Akademii Nauk SSSR"},{"key":"ref22","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.373"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref25","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"International Conference on Learning Representations","author":"Hu"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832252"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434615.pdf?arnumber=11434615","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:57:13Z","timestamp":1775192233000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434615\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":26,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434615","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}