{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T06:24:39Z","timestamp":1774592679217,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681527","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"1800-1809","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["GLoMo: Global-Local Modal Fusion for Multimodal Sentiment Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7444-3275","authenticated-orcid":false,"given":"Yan","family":"Zhuang","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4182-2150","authenticated-orcid":false,"given":"Yanru","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China &amp; Shenzhen Institute for Advanced Study, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4950-3167","authenticated-orcid":false,"given":"Zheng","family":"Hu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3193-1840","authenticated-orcid":false,"given":"Xiaoyue","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0602-8250","authenticated-orcid":false,"given":"Jiawen","family":"Deng","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4860-9184","authenticated-orcid":false,"given":"Fuji","family":"Ren","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China &amp; Shenzhen Institute for Advanced Study, Chengdu, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Manuel Montes-y G\u00f3mez, and Fabio A Gonz\u00e1lez","author":"Arevalo John","year":"2017","unstructured":"John Arevalo, Thamar Solorio, Manuel Montes-y G\u00f3mez, and Fabio A Gonz\u00e1lez. 2017. Gated multimodal units for information fusion. arXiv preprint arXiv:1702.01992 (2017)."},{"key":"e_1_3_2_1_2_1","volume-title":"Multimodal machine learning: A survey and taxonomy","author":"Baltruaitis Tadas","year":"2018","unstructured":"Tadas Baltruaitis, Chaitanya Ahuja, and Louis-Philippe Morency. 2018. Multimodal machine learning: A survey and taxonomy. IEEE transactions on pattern analysis and machine intelligence 41, 2 (2018), 423--443."},{"key":"e_1_3_2_1_3_1","volume-title":"Openface: an open source facial behavior analysis toolkit. In 2016 IEEE winter conference on applications of computer vision (WACV)","author":"Baltruaitis Tadas","unstructured":"Tadas Baltruaitis, Peter Robinson, and Louis-Philippe Morency. 2016. Openface: an open source facial behavior analysis toolkit. In 2016 IEEE winter conference on applications of computer vision (WACV). IEEE, 1--10."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00019"},{"key":"e_1_3_2_1_5_1","volume-title":"On the Benefits of Early Fusion in Multimodal Representation Learning. In NeurIPS 2020 Workshop SVRHM.","author":"Barnum George","year":"2020","unstructured":"George Barnum, Sabera J Talukder, and Yisong Yue. 2020. On the Benefits of Early Fusion in Multimodal Representation Learning. In NeurIPS 2020 Workshop SVRHM."},{"key":"e_1_3_2_1_6_1","volume-title":"Towards multimodal sarcasm detection (an _obviously_ perfect paper). arXiv preprint arXiv:1906.01815","author":"Castro Santiago","year":"2019","unstructured":"Santiago Castro, Devamanyu Hazarika, Ver\u00f3nica P\u00e9rez-Rosas, Roger Zimmermann, Rada Mihalcea, and Soujanya Poria. 2019. Towards multimodal sarcasm detection (an _obviously_ perfect paper). arXiv preprint arXiv:1906.01815 (2019)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3136801"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3124365"},{"key":"e_1_3_2_1_9_1","volume-title":"2014 ieee international conference on acoustics, speech and signal processing (icassp)","author":"Degottex Gilles","unstructured":"Gilles Degottex, John Kane, Thomas Drugman, Tuomo Raitio, and Stefan Scherer. 2014. COVAREP?A collaborative voice analysis repository for speech technologies. In 2014 ieee international conference on acoustics, speech and signal processing (icassp). IEEE, 960--964."},{"key":"e_1_3_2_1_10_1","volume-title":"International Conference on Machine Learning. PMLR, 5547--5569","author":"Du Nan","year":"2022","unstructured":"Nan Du, Yanping Huang, Andrew M Dai, Simon Tong, Dmitry Lepikhin, Yuanzhong Xu, Maxim Krikun, Yanqi Zhou, Adams Wei Yu, Orhan Firat, et al. 2022. Glam: Efficient scaling of language models with mixture-of-experts. In International Conference on Machine Learning. PMLR, 5547--5569."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2022.09.025"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548137"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3462244.3479919"},{"key":"e_1_3_2_1_14_1","volume-title":"Louis-Philippe Morency, et al.","author":"Hasan Md Kamrul","year":"2019","unstructured":"Md Kamrul Hasan, Wasifur Rahman, Amir Zadeh, Jianyuan Zhong, Md Iftekhar Tanveer, Louis-Philippe Morency, et al. 2019. UR-FUNNY: A multimodal language dataset for understanding humor. arXiv preprint arXiv:1904.06618 (2019)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.62"},{"key":"e_1_3_2_1_18_1","volume-title":"International Conference on Learning Representations.","author":"Hjelm R Devon","year":"2018","unstructured":"R Devon Hjelm, Alex Fedorov, Samuel Lavoie-Marchildon, Karan Grewal, Phil Bachman, Adam Trischler, and Yoshua Bengio. 2018. Learning deep representations by mutual information estimation and maximization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_19_1","volume-title":"Deep multimodal multilinear fusion with high-order polynomial pooling. Advances in Neural Information Processing Systems 32","author":"Hou Ming","year":"2019","unstructured":"Ming Hou, Jiajia Tang, Jianhai Zhang,Wanzeng Kong, and Qibin Zhao. 2019. Deep multimodal multilinear fusion with high-order polynomial pooling. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/s12652-016-0406-z"},{"key":"e_1_3_2_1_22_1","volume-title":"Suzanne Nie, Richard Chen, Zihao Deng, Faisal Mahmood, Ruslan Salakhutdinov, and Louis-Philippe Morency.","author":"Liang Paul Pu","year":"2023","unstructured":"Paul Pu Liang, Yun Cheng, Xiang Fan, Chun Kai Ling, Suzanne Nie, Richard Chen, Zihao Deng, Faisal Mahmood, Ruslan Salakhutdinov, and Louis-Philippe Morency. 2023. Quantifying & modeling feature interactions: An information decomposition framework. arXiv preprint arXiv:2302.12247 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Factorized Contrastive Learning: Going Beyond Multi-view Redundancy. arXiv preprint arXiv:2306.05268","author":"Liang Paul Pu","year":"2023","unstructured":"Paul Pu Liang, Zihao Deng, Martin Ma, James Zou, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2023. Factorized Contrastive Learning: Going Beyond Multi-view Redundancy. arXiv preprint arXiv:2306.05268 (2023)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1014"},{"key":"e_1_3_2_1_25_1","volume-title":"Highmodality multimodal transformer: Quantifying modality & interaction heterogeneity for high-modality representation learning. Transactions on Machine Learning Research","author":"Liang Paul Pu","year":"2022","unstructured":"Paul Pu Liang, Yiwei Lyu, Xiang Fan, Jeffrey Tsaw, Yudong Liu, Shentong Mo, Dani Yogatama, Louis-Philippe Morency, and Russ Salakhutdinov. 2022. Highmodality multimodal transformer: Quantifying modality & interaction heterogeneity for high-modality representation learning. Transactions on Machine Learning Research (2022)."},{"key":"e_1_3_2_1_26_1","volume-title":"Foundations and recent trends in multimodal machine learning: Principles, challenges, and open questions. arXiv preprint arXiv:2209.03430","author":"Liang Paul Pu","year":"2022","unstructured":"Paul Pu Liang, Amir Zadeh, and Louis-Philippe Morency. 2022. Foundations and recent trends in multimodal machine learning: Principles, challenges, and open questions. arXiv preprint arXiv:2209.03430 (2022)."},{"key":"e_1_3_2_1_27_1","volume-title":"Dynamically shifting multimodal representations via hybrid-modal attention for multimodal sentiment analysis","author":"Lin Ronghao","year":"2023","unstructured":"Ronghao Lin and Haifeng Hu. 2023. Dynamically shifting multimodal representations via hybrid-modal attention for multimodal sentiment analysis. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591646"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3556630"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1209"},{"key":"e_1_3_2_1_31_1","volume-title":"DecoupledWeight Decay Regularization. In International Conference on Learning Representations.","author":"Loshchilov Ilya","year":"2018","unstructured":"Ilya Loshchilov and Frank Hutter. 2018. DecoupledWeight Decay Regularization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_32_1","volume-title":"Scalevlad: Improving multimodal sentiment analysis via multi-scale fusion of locally descriptors. arXiv preprint arXiv:2112.01368","author":"Luo Huaishao","year":"2021","unstructured":"Huaishao Luo, Lei Ji, Yanyong Huang, Bin Wang, Shenggong Ji, and Tianrui Li. 2021. Scalevlad: Improving multimodal sentiment analysis via multi-scale fusion of locally descriptors. arXiv preprint arXiv:2112.01368 (2021)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00258"},{"key":"e_1_3_2_1_34_1","volume-title":"Multimodal Sentiment Analysis with Preferential Fusion and Distance-aware Contrastive Learning. In 2023 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1367--1372","author":"Ma Feipeng","year":"2023","unstructured":"Feipeng Ma, Yueyi Zhang, and Xiaoyan Sun. 2023. Multimodal Sentiment Analysis with Preferential Fusion and Distance-aware Contrastive Learning. In 2023 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1367--1372."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1046"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5347"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2022.11.003"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3068598"},{"key":"e_1_3_2_1_39_1","volume-title":"Multimodal information bottleneck: Learning minimal sufficient unimodal and multimodal representations","author":"Mai Sijie","year":"2022","unstructured":"Sijie Mai, Ying Zeng, and Haifeng Hu. 2022. Multimodal information bottleneck: Learning minimal sufficient unimodal and multimodal representations. IEEE Transactions on Multimedia (2022)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101920"},{"key":"e_1_3_2_1_41_1","volume-title":"Hybrid contrastive learning of tri-modal representation for multimodal sentiment analysis","author":"Mai Sijie","year":"2022","unstructured":"Sijie Mai, Ying Zeng, Shuangjia Zheng, and Haifeng Hu. 2022. Hybrid contrastive learning of tri-modal representation for multimodal sentiment analysis. IEEE Transactions on Affective Computing (2022)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01251"},{"key":"e_1_3_2_1_43_1","volume-title":"From utterance to text: The bias of language in speech and writing. Harvard educational review 47, 3","author":"Olson David","year":"1977","unstructured":"David Olson. 1977. From utterance to text: The bias of language in speech and writing. Harvard educational review 47, 3 (1977), 257--281."},{"key":"e_1_3_2_1_44_1","volume-title":"Beneath the tip of the iceberg: Current challenges and new directions in sentiment analysis research","author":"Poria Soujanya","year":"2020","unstructured":"Soujanya Poria, Devamanyu Hazarika, Navonil Majumder, and Rada Mihalcea. 2020. Beneath the tip of the iceberg: Current challenges and new directions in sentiment analysis research. IEEE transactions on affective computing 14, 1 (2020), 108--132."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"e_1_3_2_1_46_1","volume-title":"Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. In International Conference on Learning Representations.","author":"Shazeer Noam","year":"2016","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2016. Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548025"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.39"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6431"},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the conference. Association for Computational Linguistics. Meeting","volume":"2019","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai, Shaojie Bai, Paul Pu Liang, J Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Multimodal transformer for unaligned multimodal language sequences. In Proceedings of the conference. Association for Computational Linguistics. Meeting, Vol. 2019. NIH Public Access, 6558."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1443"},{"key":"e_1_3_2_1_52_1","volume-title":"Integrating auxiliary information in self-supervised learning. arXiv preprint arXiv:2106.02869","author":"Hubert Tsai Yao-Hung","year":"2021","unstructured":"Yao-Hung Hubert Tsai, Tianqin Li, Weixin Liu, Peiyuan Liao, Ruslan Salakhutdinov, and Louis-Philippe Morency. 2021. Integrating auxiliary information in self-supervised learning. arXiv preprint arXiv:2106.02869 (2021)."},{"key":"e_1_3_2_1_53_1","volume-title":"Learning Factorized Multimodal Representations. In International Conference on Representation Learning.","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai, Paul Pu Liang, Amir Zadeh, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Learning Factorized Multimodal Representations. In International Conference on Representation Learning."},{"key":"e_1_3_2_1_54_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_55_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33017216"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1115"},{"key":"e_1_3_2_1_59_1","volume-title":"Mosi: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. arXiv preprint arXiv:1606.06259","author":"Zadeh Amir","year":"2016","unstructured":"Amir Zadeh, Rowan Zellers, Eli Pincus, and Louis-Philippe Morency. 2016. Mosi: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. arXiv preprint arXiv:1606.06259 (2016)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.109"},{"key":"e_1_3_2_1_62_1","volume-title":"Multimodal reaction: Information modulation for cross-modal representation learning","author":"Zeng Ying","year":"2023","unstructured":"Ying Zeng, Sijie Mai,Wenjun Yan, and Haifeng Hu. 2023. Multimodal reaction: Information modulation for cross-modal representation learning. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746682"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2016.2603342"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611974"},{"key":"e_1_3_2_1_66_1","volume-title":"Designing Effective Sparse Expert Models. In 2022 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW). IEEE, 1044--1044","author":"Zoph Barret","year":"2022","unstructured":"Barret Zoph. 2022. Designing Effective Sparse Expert Models. In 2022 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW). IEEE, 1044--1044."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681527","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681527","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:48Z","timestamp":1750294668000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681527"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":66,"alternative-id":["10.1145\/3664647.3681527","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681527","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}