{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T21:51:27Z","timestamp":1768341087393,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T00:00:00Z","timestamp":1730678400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,4]]},"DOI":"10.1145\/3678957.3685705","type":"proceedings-article","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T04:35:53Z","timestamp":1730262953000},"page":"526-535","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["DoubleDistillation: Enhancing LLMs for Informal Text Analysis using Multistage Knowledge Distillation from Speech and Text"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9722-4243","authenticated-orcid":false,"given":"Fatema","family":"Hasan","sequence":"first","affiliation":[{"name":"Department of Information Systems, University of Maryland, Baltimore County, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3412-0732","authenticated-orcid":false,"given":"Yulong","family":"Li","sequence":"additional","affiliation":[{"name":"IBM Research AI, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0935-4182","authenticated-orcid":false,"given":"James R.","family":"Foulds","sequence":"additional","affiliation":[{"name":"Department of Information Systems, University of Maryland, Baltimore County, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5989-8543","authenticated-orcid":false,"given":"Shimei","family":"Pan","sequence":"additional","affiliation":[{"name":"Department of Information Systems, University of Maryland, Baltimore County, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7097-4891","authenticated-orcid":false,"given":"Bishwaranjan","family":"Bhattacharjee","sequence":"additional","affiliation":[{"name":"IBM Research AI, United States"}]}],"member":"320","published-online":{"date-parts":[[2024,11,4]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33 (2020), 12449\u201312460."},{"key":"e_1_3_2_2_2_1","volume-title":"IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation 42","author":"Busso Carlos","year":"2008","unstructured":"Carlos Busso, Murtaza Bulut, Chi-Chun Lee, Abe Kazemzadeh, Emily Mower, Samuel Kim, Jeannette\u00a0N Chang, Sungbok Lee, and Shrikanth\u00a0S Narayanan. 2008. IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation 42 (2008), 335\u2013359."},{"key":"e_1_3_2_2_3_1","volume-title":"Towards multimodal sarcasm detection (an _obviously_ perfect paper). 
arXiv preprint arXiv:1906.01815","author":"Castro Santiago","year":"2019","unstructured":"Santiago Castro, Devamanyu Hazarika, Ver\u00f3nica P\u00e9rez-Rosas, Roger Zimmermann, Rada Mihalcea, and Soujanya Poria. 2019. Towards multimodal sarcasm detection (an _obviously_ perfect paper). arXiv preprint arXiv:1906.01815 (2019)."},{"key":"e_1_3_2_2_4_1","volume-title":"On the Transferability of Whisper-based Representations for\" In-the-Wild\" Cross-Task Downstream Speech Applications. arXiv preprint arXiv:2305.14546","author":"Chemudupati Vamsikrishna","year":"2023","unstructured":"Vamsikrishna Chemudupati, Marzieh Tahaei, Heitor Guimaraes, Arthur Pimentel, Anderson Avila, Mehdi Rezagholizadeh, Boxing Chen, and Tiago Falk. 2023. On the Transferability of Whisper-based Representations for\" In-the-Wild\" Cross-Task Downstream Speech Applications. arXiv preprint arXiv:2305.14546 (2023)."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_2_6_1","volume-title":"UNITER: Learning UNiversal Image-TExt Representations. CoRR abs\/1909.11740","author":"Chen Yen-Chun","year":"2019","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed\u00a0El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2019. UNITER: Learning UNiversal Image-TExt Representations. CoRR abs\/1909.11740 (2019). arXiv:1909.11740http:\/\/arxiv.org\/abs\/1909.11740"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00511"},{"key":"e_1_3_2_2_8_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_9_1","volume-title":"The People\u2019s Speech: A Large-Scale Diverse English Speech Recognition Dataset for Commercial Usage. arXiv preprint arXiv:2111.09344","author":"Galvez Daniel","year":"2021","unstructured":"Daniel Galvez, Greg Diamos, Juan Ciro, Juan\u00a0Felipe Cer\u00f3n, Keith Achorn, Anjali Gopi, David Kanter, Maximilian Lam, Mark Mazumder, and Vijay\u00a0Janapa Reddi. 2021. The People\u2019s Speech: A Large-Scale Diverse English Speech Recognition Dataset for Commercial Usage. arXiv preprint arXiv:2111.09344 (2021)."},{"key":"e_1_3_2_2_10_1","volume-title":"Tools, language and cognition in human evolution","author":"Gibson R","unstructured":"Kathleen\u00a0R Gibson, Kathleen\u00a0Rita Gibson, and Tim Ingold. 1993. Tools, language and cognition in human evolution. Cambridge University Press."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.5555\/2188385.2188410"},{"key":"e_1_3_2_2_12_1","volume-title":"Teach me with a Whisper: Enhancing Large Language Models for Analyzing Spoken Transcripts using Speech Embeddings. arXiv preprint arXiv:2311.07014","author":"Hasan Fatema","year":"2023","unstructured":"Fatema Hasan, Yulong Li, James Foulds, Shimei Pan, and Bishwaranjan Bhattacharjee. 2023. Teach me with a Whisper: Enhancing Large Language Models for Analyzing Spoken Transcripts using Speech Embeddings. arXiv preprint arXiv:2311.07014 (2023)."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00065"},{"key":"e_1_3_2_2_14_1","unstructured":"Geoffrey Hinton Oriol Vinyals and Jeff Dean. 2015. Distilling the Knowledge in a Neural Network. 
arxiv:1503.02531\u00a0[stat.ML]"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_2_16_1","volume-title":"Like what you like: Knowledge distill via neuron selectivity transfer. arXiv preprint arXiv:1707.01219","author":"Huang Zehao","year":"2017","unstructured":"Zehao Huang and Naiyan Wang. 2017. Like what you like: Knowledge distill via neuron selectivity transfer. arXiv preprint arXiv:1707.01219 (2017)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21334"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.5120\/17712-8078"},{"key":"e_1_3_2_2_19_1","volume-title":"Sequence-Level Knowledge Distillation. CoRR abs\/1606.07947","author":"Kim Yoon","year":"2016","unstructured":"Yoon Kim and Alexander\u00a0M. Rush. 2016. Sequence-Level Knowledge Distillation. CoRR abs\/1606.07947 (2016). arXiv:1606.07947http:\/\/arxiv.org\/abs\/1606.07947"},{"key":"e_1_3_2_2_20_1","volume-title":"Unicoder-VL: A Universal Encoder for Vision and Language by Cross-modal Pre-training. CoRR abs\/1908.06066","author":"Li Gen","year":"2019","unstructured":"Gen Li, Nan Duan, Yuejian Fang, Daxin Jiang, and Ming Zhou. 2019. Unicoder-VL: A Universal Encoder for Vision and Language by Cross-modal Pre-training. CoRR abs\/1908.06066 (2019). arXiv:1908.06066http:\/\/arxiv.org\/abs\/1908.06066"},{"key":"e_1_3_2_2_21_1","volume-title":"HERO: Hierarchical Encoder for Video+Language Omni-representation Pre-training. CoRR abs\/2005.00200","author":"Li Linjie","year":"2020","unstructured":"Linjie Li, Yen-Chun Chen, Yu Cheng, Zhe Gan, Licheng Yu, and Jingjing Liu. 2020. HERO: Hierarchical Encoder for Video+Language Omni-representation Pre-training. CoRR abs\/2005.00200 (2020). arXiv:2005.00200https:\/\/arxiv.org\/abs\/2005.00200"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-5010"},{"key":"e_1_3_2_2_24_1","volume-title":"End-to-End Learning of Visual Representations from Uncurated Instructional Videos. CoRR abs\/1912.06430","author":"Miech Antoine","year":"2019","unstructured":"Antoine Miech, Jean-Baptiste Alayrac, Lucas Smaira, Ivan Laptev, Josef Sivic, and Andrew Zisserman. 2019. End-to-End Learning of Visual Representations from Uncurated Instructional Videos. CoRR abs\/1912.06430 (2019). arXiv:1912.06430http:\/\/arxiv.org\/abs\/1912.06430"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/S18-1001"},{"key":"e_1_3_2_2_27_1","volume-title":"SemEval-2017 task 4: Sentiment analysis in Twitter. arXiv preprint arXiv:1912.00741","author":"Rosenthal Sara","year":"2019","unstructured":"Sara Rosenthal, Noura Farra, and Preslav Nakov. 2019. SemEval-2017 task 4: Sentiment analysis in Twitter. arXiv preprint arXiv:1912.00741 (2019)."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3026823"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00926"},{"key":"e_1_3_2_2_30_1","volume-title":"Visual-Grounded Supervision. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). 2066\u20132080","author":"Tan Hao","year":"2020","unstructured":"Hao Tan and Mohit Bansal. 2020. Vokenization: Improving Language Understanding with Contextualized, Visual-Grounded Supervision. 
In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). 2066\u20132080."},{"key":"e_1_3_2_2_31_1","first-page":"24468","article-title":"Vidlankd: Improving language understanding via video-distilled knowledge transfer","volume":"34","author":"Tang Zineng","year":"2021","unstructured":"Zineng Tang, Jaemin Cho, Hao Tan, and Mohit Bansal. 2021. Vidlankd: Improving language understanding via video-distilled knowledge transfer. Advances in Neural Information Processing Systems 34 (2021), 24468\u201324481.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_32_1","volume-title":"Contrastive representation distillation. arXiv preprint arXiv:1910.10699","author":"Tian Yonglong","year":"2019","unstructured":"Yonglong Tian, Dilip Krishnan, and Phillip Isola. 2019. Contrastive representation distillation. arXiv preprint arXiv:1910.10699 (2019)."},{"key":"e_1_3_2_2_33_1","volume-title":"Multimodal Transformer for Unaligned Multimodal Language Sequences. CoRR abs\/1906.00295","author":"Tsai Hung\u00a0Hubert","year":"2019","unstructured":"Yao-Hung\u00a0Hubert Tsai, Shaojie Bai, Paul\u00a0Pu Liang, J.\u00a0Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Multimodal Transformer for Unaligned Multimodal Language Sequences. CoRR abs\/1906.00295 (2019). arXiv:1906.00295http:\/\/arxiv.org\/abs\/1906.00295"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1656"},{"key":"e_1_3_2_2_35_1","unstructured":"Austin Waters and Yevgen Chebotar. 2016. Distilling Knowledge from Ensembles of Neural Networks for Speech Recognition. In Interspeech."},{"key":"e_1_3_2_2_36_1","volume-title":"i-code: An integrative and composable multimodal learning framework. arXiv preprint arXiv:2205.01818","author":"Yang Ziyi","year":"2022","unstructured":"Ziyi Yang, Yuwei Fang, Chenguang Zhu, Reid Pryzant, Dongdong Chen, Yu Shi, Yichong Xu, Yao Qian, Mei Gao, Yi-Ling Chen, 2022. i-code: An integrative and composable multimodal learning framework. arXiv preprint arXiv:2205.01818 (2022)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"}],"event":{"name":"ICMI '24: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","location":"San Jose Costa Rica","acronym":"ICMI '24"},"container-title":["International Conference on Multimodel Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3678957.3685705","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3678957.3685705","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:12Z","timestamp":1750295412000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3678957.3685705"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,4]]},"references-count":37,"alternative-id":["10.1145\/3678957.3685705","10.1145\/3678957"],"URL":"https:\/\/doi.org\/10.1145\/3678957.3685705","relation":{},"subject":[],"published":{"date-parts":[[2024,11,4]]},"assertion":[{"value":"2024-11-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
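For context, a minimal sketch of how a Crossref work record like the one above can be fetched and a few of its fields read. This is an illustrative assumption, not part of the record itself: it assumes Python with the requests package installed and network access to api.crossref.org; the field names follow the "message" object shown above.

# Sketch only: fetch the work record for the DOI above and print basic metadata.
import requests

DOI = "10.1145/3678957.3685705"
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # the "message" object mirrors the record above

title = work["title"][0]                                   # paper title
authors = [f'{a["given"]} {a["family"]}' for a in work["author"]]
venue = work.get("container-title", [""])[0]               # proceedings title
pages = work.get("page")                                   # e.g. "526-535"
print(title)
print(", ".join(authors))
print(venue, pages)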