{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T20:47:17Z","timestamp":1771015637401,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","funder":[{"name":"Spanish Ministry of Science and Innovation | GOMINOLA","award":["PID2020-118112RB-C22"],"award-info":[{"award-number":["PID2020-118112RB-C22"]}]},{"name":"Spanish Ministry of Science and Innovation | TRUSTBOOST","award":["PID2023-150584OB-C21"],"award-info":[{"award-number":["PID2023-150584OB-C21"]}]},{"name":"Spanish Ministry of Science and Innovation | BeWord","award":["PID2021-126061OB-C43"],"award-info":[{"award-number":["PID2021-126061OB-C43"]}]},{"name":"Spanish Ministry of Education","award":["PRE2022-105516"],"award-info":[{"award-number":["PRE2022-105516"]}]},{"DOI":"10.13039\/501100000780","name":"European Commission","doi-asserted-by":"publisher","award":["101071191 \u2014 HORIZON-EIC-2021-PATHFINDERCHALLENGES-01"],"award-info":[{"award-number":["101071191 \u2014 HORIZON-EIC-2021-PATHFINDERCHALLENGES-01"]}],"id":[{"id":"10.13039\/501100000780","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,16]]},"DOI":"10.1145\/3742886.3756724","type":"proceedings-article","created":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T14:55:41Z","timestamp":1759244141000},"page":"1-9","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["LLM-Driven Multimodal Video-Text Fusion for Isolated Sign Language Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-6336-7877","authenticated-orcid":false,"given":"Sergio","family":"Esteban-Romero","sequence":"first","affiliation":[{"name":"Universidad Polit\u00e9cnica de Madrid, Madrid, Madrid, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5369-856X","authenticated-orcid":false,"given":"Cristina","family":"Luna-Jim\u00e9nez","sequence":"additional","affiliation":[{"name":"Chair for Human-Centered Artificial Intelligence, Augsburg University, Augsburg, Augsburg, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4285-6224","authenticated-orcid":false,"given":"Manuel","family":"Gil-Mart\u00edn","sequence":"additional","affiliation":[{"name":"Universidad Polit\u00e9cnica de Madrid, Madrid, Madrid, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3877-0089","authenticated-orcid":false,"given":"Fernando","family":"Fern\u00e1ndez-Mart\u00ednez","sequence":"additional","affiliation":[{"name":"Electronics Engineering Departament, Universidad Polit\u00e9cnica de Madrid, Madrid, Madrid, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2367-162X","authenticated-orcid":false,"given":"Elisabeth","family":"Andre","sequence":"additional","affiliation":[{"name":"University of Augsburg, Augsburg, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,9,30]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445841"},{"key":"e_1_3_3_2_3_2","volume-title":"European Conference on Computer Vision","author":"Albanie Samuel","year":"2020","unstructured":"Samuel Albanie, G\u00fcl Varol, Liliane Momeni, Triantafyllos Afouras, Joon\u00a0Son Chung, Neil Fox, and Andrew Zisserman. 2020. BSL-1K: Scaling up co-articulated sign language recognition using mouthing cues. In European Conference on Computer Vision."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","unstructured":"Tobias Baur Alexander Heimerl Florian Lingenfelser Johannes Wagner Michel\u00a0F. Valstar Bj\u00f6rn Schuller and Elisabeth Andr\u00e9. 2020. eXplainable Cooperative Machine Learning with NOVA. KI - K\u00fcnstliche Intelligenz (19 Jan 2020). 10.1007\/s13218-020-00632-3","DOI":"10.1007\/s13218-020-00632-3"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3529190.3529202"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW54805.2022.00024"},{"key":"e_1_3_3_2_7_2","unstructured":"Daniel Bolya Po-Yao Huang Peize Sun Jang\u00a0Hyun Cho Andrea Madotto Chen Wei Tengyu Ma Jiale Zhi Jathushan Rajasegaran Hanoona Rasheed et\u00a0al. 2025. Perception encoder: The best visual embeddings are not at the output of the network. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.13181 (2025)."},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00236"},{"key":"e_1_3_3_2_9_2","unstructured":"Tim Dettmers Artidoro Pagnoni Ari Holtzman and Luke Zettlemoyer. 2023. QLoRA: Efficient Finetuning of Quantized LLMs. arxiv:https:\/\/arXiv.org\/abs\/2305.14314\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2305.14314"},{"key":"e_1_3_3_2_10_2","unstructured":"Jacob Devlin Ming-Wei Chang Kenton Lee and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv:https:\/\/arXiv.org\/abs\/1810.04805\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3689062.3689084"},{"key":"e_1_3_3_2_12_2","unstructured":"An\u00a0Yang et al.2025. Qwen3 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2505.09388\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2505.09388"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01738"},{"key":"e_1_3_3_2_14_2","first-page":"11087","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision","author":"Hu Hezhen","year":"2021","unstructured":"Hezhen Hu, Weichao Zhao, Wengang Zhou, Yuechen Wang, and Houqiang Li. 2021. Signbert: pre-training of hand-model-aware representation for sign language recognition. In Proceedings of the IEEE\/CVF international conference on computer vision. 11087\u201311096."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01890"},{"key":"e_1_3_3_2_16_2","unstructured":"Jungeun Kim Hyeongwoo Jeon Jongseong Bae and Ha\u00a0Young Kim. 2024. Leveraging the Power of MLLMs for Gloss-Free Sign Language Translation. arxiv:https:\/\/arXiv.org\/abs\/2411.16789\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2411.16789"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","unstructured":"Reiner Konrad Thomas Hanke Gabriele Langer Susanne K\u00f6nig Lutz K\u00f6nig Rie Nishio and Anja Regen. 2022. Public DGS Corpus: Annotation Conventions \/ \u00d6ffentliches DGS-Korpus: Annotationskonventionen. 10.25592\/uhhfdm.10251","DOI":"10.25592\/uhhfdm.10251"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093512"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"crossref","unstructured":"Dongxu Li Cristian Rodriguez-Opazo Xin Yu and Hongdong Li. 2019. Word-level Deep Sign Language Recognition from Video: A New Large-scale Dataset and Methods Comparison. 2020 IEEE Winter Conference on Applications of Computer Vision (WACV) (2019) 1448\u20131458. https:\/\/api.semanticscholar.org\/CorpusID:204851909","DOI":"10.1109\/WACV45572.2020.9093512"},{"key":"e_1_3_3_2_20_2","unstructured":"Zecheng Li Wengang Zhou Weichao Zhao Kepeng Wu Hezhen Hu and Houqiang Li. 2025. Uni-Sign: Toward Unified Sign Language Understanding at Scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.15187 (2025)."},{"key":"e_1_3_3_2_21_2","unstructured":"Zecheng Li Wengang Zhou Weichao Zhao Kepeng Wu Hezhen Hu and Houqiang Li. 2025. Uni-Sign: Toward Unified Sign Language Understanding at Scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.15187 (2025)."},{"key":"e_1_3_3_2_22_2","first-page":"5739","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"39","author":"Liu Yuqi","year":"2025","unstructured":"Yuqi Liu, Wenqian Zhang, Sihan Ren, Chengyu Huang, Jingyi Yu, and Lan Xu. 2025. SCOPE: Sign Language Contextual Processing with Embedding from LLMs. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol.\u00a039. 5739\u20135747."},{"key":"e_1_3_3_2_23_2","volume-title":"Third Workshop on Computer Vision for AR\/VR at IEEE Computer Vision and Pattern Recognition (CVPR) 2019","author":"Lugaresi Camillo","year":"2019","unstructured":"Camillo Lugaresi, Jiuqiang Tang, Hadon Nash, Chris McClanahan, Esha Uboweja, Michael Hays, Fan Zhang, Chuo-Ling Chang, Ming Yong, Juhyun Lee, Wan-Teh Chang, Wei Hua, Manfred Georg, and Matthias Grundmann. 2019. MediaPipe: A Framework for Perceiving and Processing Reality. In Third Workshop on Computer Vision for AR\/VR at IEEE Computer Vision and Pattern Recognition (CVPR) 2019. https:\/\/mixedreality.cs.cornell.edu\/s\/NewTitle_May1_MediaPipe_CVPR_CV4ARVR_Workshop_2019.pdf"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3716553.3750772"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3577190.3614143"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3689062.3689083"},{"key":"e_1_3_3_2_27_2","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark Gretchen Krueger and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arxiv:https:\/\/arXiv.org\/abs\/2103.00020\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_3_2_28_2","unstructured":"Ilya\u00a0O Tolstikhin Bharath\u00a0K Sriperumbudur and Bernhard Sch\u00f6lkopf. 2016. Minimax estimation of maximum mean discrepancy with radial kernels. Advances in Neural Information Processing Systems 29 (2016)."},{"key":"e_1_3_3_2_29_2","unstructured":"Ryan Wong Necati\u00a0Cihan Camgoz and Richard Bowden. 2024. Sign2GPT: Leveraging Large Language Models for Gloss-Free Sign Language Translation. arxiv:https:\/\/arXiv.org\/abs\/2405.04164\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2405.04164"},{"key":"e_1_3_3_2_30_2","series-title":"(AAAI\u201918\/IAAI\u201918\/EAAI\u201918)","volume-title":"Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence and Thirtieth Innovative Applications of Artificial Intelligence Conference and Eighth AAAI Symposium on Educational Advances in Artificial Intelligence","author":"Yan Sijie","year":"2018","unstructured":"Sijie Yan, Yuanjun Xiong, and Dahua Lin. 2018. Spatial temporal graph convolutional networks for skeleton-based action recognition. In Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence and Thirtieth Innovative Applications of Artificial Intelligence Conference and Eighth AAAI Symposium on Educational Advances in Artificial Intelligence (New Orleans, Louisiana, USA) (AAAI\u201918\/IAAI\u201918\/EAAI\u201918). AAAI Press, Article 912, 9\u00a0pages."}],"event":{"name":"IVA Adjunct '25: ACM International Conference on Intelligent Virtual Agents","location":"Berlin Germany","acronym":"IVA Adjunct '25","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence"]},"container-title":["Adjunct Proceedings of the 25th ACM International Conference on Intelligent Virtual Agents"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3742886.3756724","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T14:59:32Z","timestamp":1759244372000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3742886.3756724"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,16]]},"references-count":29,"alternative-id":["10.1145\/3742886.3756724","10.1145\/3742886"],"URL":"https:\/\/doi.org\/10.1145\/3742886.3756724","relation":{},"subject":[],"published":{"date-parts":[[2025,9,16]]},"assertion":[{"value":"2025-09-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}