{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T18:15:52Z","timestamp":1771265752209,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":19,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"European Commission","doi-asserted-by":"publisher","award":["101071191 ? HORIZON-EIC-2021-PATHFINDERCHALLENGES-01"],"award-info":[{"award-number":["101071191 ? HORIZON-EIC-2021-PATHFINDERCHALLENGES-01"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Spanish Ministry of Education","award":["PRE2022-105516,PID2021-126061OB-C43,PID2020-118112RB-C21, PID2020-118112RB-C22, PID2023-150584OB-C21, PID2023-150584OB-C22"],"award-info":[{"award-number":["PRE2022-105516,PID2021-126061OB-C43,PID2020-118112RB-C21, PID2020-118112RB-C22, PID2023-150584OB-C21, PID2023-150584OB-C22"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3689062.3689084","type":"proceedings-article","created":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T18:29:19Z","timestamp":1729708159000},"page":"45-51","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["LLM-Driven Multimodal Fusion for Human Perception Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-6336-7877","authenticated-orcid":false,"given":"Sergio","family":"Esteban-Romero","sequence":"first","affiliation":[{"name":"THAU Group, IPTC, Universidad Polit\u00e9cnica de Madrid, Madrid, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2769-9752","authenticated-orcid":false,"given":"Iv\u00e1n","family":"Mart\u00edn-Fern\u00e1ndez","sequence":"additional","affiliation":[{"name":"THAU Group, IPTC, Universidad Polit\u00e9cnica de Madrid, Madrid, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4285-6224","authenticated-orcid":false,"given":"Manuel","family":"Gil-Mart\u00edn","sequence":"additional","affiliation":[{"name":"THAU Group, IPTC, Universidad Polit\u00e9cnica de Madrid, Madrid, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6266-5321","authenticated-orcid":false,"given":"David","family":"Griol-Barres","sequence":"additional","affiliation":[{"name":"Universidad de Granada, Granada, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8891-5237","authenticated-orcid":false,"given":"Zoraida","family":"Callejas-Carri\u00f3n","sequence":"additional","affiliation":[{"name":"Universidad de Granada, Granada, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3877-0089","authenticated-orcid":false,"given":"Fernando","family":"Fern\u00e1ndez-Mart\u00ednez","sequence":"additional","affiliation":[{"name":"THAU Group, IPTC, Universidad Polit\u00e9cnica de Madrid, Madrid, Spain"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0--12--800284--1.00004--7"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Shahin Amiriparian Lukas Christ Alexander Kathan Maurice Gerczuk Niklas M\u00fcller Steffen Klug Lukas Stappen Andreas K\u00f6nig Erik Cambria Bj\u00f6rn Schuller et al. 2024. The MuSe 2024 Multimodal Sentiment Analysis Challenge: Social Perception and Humor Recognition. arXiv e-prints (2024) arXiv--2406.","DOI":"10.1145\/3689062.3689088"},{"key":"e_1_3_2_1_3_1","volume-title":"Lin (Eds.)","volume":"33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.), Vol. 33. Curran Associates, Inc., 12449--12460. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/92d1e1eb1cd6f9fba3227870bb6d7f07-Paper.pdf"},{"key":"e_1_3_2_1_4_1","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding Localization Text Reading and Beyond. arxiv: 2308.12966 [cs.CV] https:\/\/arxiv.org\/abs\/2308.12966"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.21437\/odyssey.2024-41"},{"key":"e_1_3_2_1_6_1","unstructured":"Yunfei Chu Jin Xu Xiaohuan Zhou Qian Yang Shiliang Zhang Zhijie Yan Chang Zhou and Jingren Zhou. 2023. Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models. arxiv: 2311.07919 [eess.AS] https:\/\/arxiv.org\/abs\/2311.07919"},{"key":"e_1_3_2_1_7_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv: 1810.04805 [cs.CL] https:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_1_8_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_9_1","volume-title":"Social cognition and social perception. Annual review of psychology","author":"Tory Higgins E","year":"1987","unstructured":"E Tory Higgins and John A Bargh. 1987. Social cognition and social perception. Annual review of psychology, Vol. 38, 1 (1987), 369--425."},{"key":"e_1_3_2_1_10_1","unstructured":"Edward J. Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2021. LoRA: Low-Rank Adaptation of Large Language Models. arxiv: 2106.09685 [cs.CL] https:\/\/arxiv.org\/abs\/2106.09685"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1080"},{"key":"e_1_3_2_1_12_1","unstructured":"Murathan Kurfali Jonas K Olofsson and Thomas H\u00f6rberg. 2023. Enhancing Multimodal Language Models with Olfactory Information. (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Thumbs up? Sentiment classification using machine learning techniques. arXiv preprint cs\/0205070","author":"Pang Bo","year":"2002","unstructured":"Bo Pang, Lillian Lee, and Shivakumar Vaithyanathan. 2002. Thumbs up? Sentiment classification using machine learning techniques. arXiv preprint cs\/0205070 (2002)."},{"key":"e_1_3_2_1_14_1","volume-title":"Emotion recognition from speech using wav2vec 2.0 embeddings. arXiv preprint arXiv:2104.03502","author":"Pepino Leonardo","year":"2021","unstructured":"Leonardo Pepino, Pablo Riera, and Luciana Ferrer. 2021. Emotion recognition from speech using wav2vec 2.0 embeddings. arXiv preprint arXiv:2104.03502 (2021)."},{"key":"e_1_3_2_1_15_1","volume-title":"International conference on machine learning. PMLR, 28492--28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. PMLR, 28492--28518."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2017.08.003"},{"key":"e_1_3_2_1_18_1","volume-title":"Gemma: Open Models Based on Gemini Research and Technology. arxiv: 2403.08295 [cs.CL] https:\/\/arxiv.org\/abs\/2403.08295","author":"Team Gemma","year":"2024","unstructured":"Gemma Team. 2024. Gemma: Open Models Based on Gemini Research and Technology. arxiv: 2403.08295 [cs.CL] https:\/\/arxiv.org\/abs\/2403.08295"},{"key":"e_1_3_2_1_19_1","unstructured":"The ModelScope Team. 2024. SWIFT:Scalable lightWeight Infrastructure for Fine-Tuning. https:\/\/github.com\/modelscope\/swift."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 5th on Multimodal Sentiment Analysis Challenge and Workshop: Social Perception and Humor"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689062.3689084","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689062.3689084","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T18:23:44Z","timestamp":1755973424000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689062.3689084"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":19,"alternative-id":["10.1145\/3689062.3689084","10.1145\/3689062"],"URL":"https:\/\/doi.org\/10.1145\/3689062.3689084","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}