{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T00:19:08Z","timestamp":1759969148429,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":16,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T00:00:00Z","timestamp":1746662400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,8]]},"DOI":"10.1145\/3701716.3717561","type":"proceedings-article","created":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T16:12:56Z","timestamp":1748016776000},"page":"2176-2180","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Exploring Multimodal Pre-trained Models for Speech Emotion Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-3578-2649","authenticated-orcid":false,"given":"Zhiyu","family":"Liu","sequence":"first","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4759-2042","authenticated-orcid":false,"given":"Junchen","family":"Fu","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2516-8407","authenticated-orcid":false,"given":"Kaiwen","family":"Zheng","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9228-1759","authenticated-orcid":false,"given":"Joemon M.","family":"Jose","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,5,23]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2014.2336244"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.01.002"},{"key":"e_1_3_2_2_3_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. NAACL-HLT","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. NAACL-HLT (2019). arxiv: 1810.04805"},{"key":"e_1_3_2_2_4_1","unstructured":"Kate Dupuis and Kathleen MK Pichora-Fuller. 2010. Toronto Emotional Speech Set (TESS). https:\/\/tspace.library.utoronto.ca\/handle\/1807\/24487"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657725"},{"key":"e_1_3_2_2_6_1","volume-title":"Efficient and Effective Adaptation of Multimodal Foundation Models in Sequential Recommendation. arXiv preprint arXiv:2411.02992","author":"Fu Junchen","year":"2024","unstructured":"Junchen Fu, Xuri Ge, Xin Xin, Alexandros Karatzoglou, Ioannis Arapakis, Kaiwen Zheng, Yongxin Ni, and Joemon M Jose. 2024b. Efficient and Effective Adaptation of Multimodal Foundation Models in Sequential Recommendation. arXiv preprint arXiv:2411.02992 (2024)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3616855.3635805"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681443"},{"key":"e_1_3_2_2_9_1","volume-title":"AST: Audio Spectrogram Transformer. ICASSP","author":"Gong Yuan","year":"2021","unstructured":"Yuan Gong, Yu-An Chung, and James Glass. 2021. AST: Audio Spectrogram Transformer. ICASSP (2021). arxiv: 2104.01778"},{"key":"e_1_3_2_2_10_1","volume-title":"ICML. arxiv","author":"Houlsby Neil","year":"1902","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Benjamin Morrone, Quentin de Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-Efficient Transfer Learning for NLP. In ICML. arxiv: 1902.00751"},{"key":"e_1_3_2_2_11_1","unstructured":"Peter Jackson and Sameer Haq. 2014. Surrey Audio-Visual Expressed Emotion (SAVEE) Database. University of Surrey Technical Report (2014). http:\/\/kahlan.eps.surrey.ac.uk\/savee\/"},{"key":"e_1_3_2_2_12_1","unstructured":"Steven R Livingstone and Frank A Russo. 2018. The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS). https:\/\/zenodo.org\/record\/1188976"},{"key":"e_1_3_2_2_13_1","volume-title":"Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. arXiv preprint arXiv:1701.06538 (2017). arxiv: 1701.06538"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2022.11.005"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.heliyon.2022.e09196"},{"key":"e_1_3_2_2_16_1","volume-title":"A Review of Vision-Language Models and Their Performance on the Hateful Memes Challenge. arXiv preprint arXiv:2305.06159","author":"Zhao Bryan","year":"2023","unstructured":"Bryan Zhao, Andrew Zhang, Blake Watson, Gillian Kearney, and Isaac Dale. 2023. A Review of Vision-Language Models and Their Performance on the Hateful Memes Challenge. arXiv preprint arXiv:2305.06159 (2023). arxiv: 2305.06159"}],"event":{"name":"WWW '25: The ACM Web Conference 2025","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Sydney NSW Australia","acronym":"WWW '25"},"container-title":["Companion Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717561","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3701716.3717561","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T03:04:50Z","timestamp":1759892690000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717561"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,8]]},"references-count":16,"alternative-id":["10.1145\/3701716.3717561","10.1145\/3701716"],"URL":"https:\/\/doi.org\/10.1145\/3701716.3717561","relation":{},"subject":[],"published":{"date-parts":[[2025,5,8]]},"assertion":[{"value":"2025-05-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}