{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:20:47Z","timestamp":1776882047103,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758202","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:39:06Z","timestamp":1761377946000},"page":"12651-12658","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Genesis: A Large-Scale Benchmark for Multimodal Large Language Model in Emotional Causality Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-8297-1129","authenticated-orcid":false,"given":"Yulong","family":"Li","sequence":"first","affiliation":[{"name":"Mohamed bin Zayed University of Artificial Intelligence, Abu Dhabi, United Arab Emirates"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7988-4228","authenticated-orcid":false,"given":"Yuxuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong-Liverpool University, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5692-8940","authenticated-orcid":false,"given":"Rui","family":"Chen","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong-Liverpool University, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7583-482X","authenticated-orcid":false,"given":"Feilong","family":"Tang","sequence":"additional","affiliation":[{"name":"Monash University, Clayton, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2461-6646","authenticated-orcid":false,"given":"Zhixiang","family":"Lu","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong-Liverpool University, Suzhou, Jiang Su, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0989-6015","authenticated-orcid":false,"given":"Ming","family":"Hu","sequence":"additional","affiliation":[{"name":"Monash University, Melbourne, VIC, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9743-9316","authenticated-orcid":false,"given":"Jianghao","family":"Wu","sequence":"additional","affiliation":[{"name":"Monash University, Melbourne, VIC, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4054-9297","authenticated-orcid":false,"given":"Haochen","family":"Xue","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong-Liverpool University, Suzhou, Jiang Su, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2514-4433","authenticated-orcid":false,"given":"Mian","family":"Zhou","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong-Liverpool University, Suzhou, Jiang Su, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9382-4727","authenticated-orcid":false,"given":"Chong","family":"Li","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong-Liverpool University, Suzhou, Jiang Su, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5360-6493","authenticated-orcid":false,"given":"Jionglong","family":"Su","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong-Liverpool University, Suzhou, Jiang Su, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3930-6600","authenticated-orcid":false,"given":"Imran","family":"Razzak","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of Artificial Intelligence, Abu Dhabi, United Arab Emirates"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"DeepSeek AI. 2024. DeepSeek-V3 Technical Report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"Meta AI. 2024. Llama 3.2--Vision: Multimodal Large Language Model (11B). https:\/\/huggingface.co\/meta-llama\/Llama-3.2--11B-Vision. Model card."},{"key":"e_1_3_2_1_3_1","unstructured":"THUDM AI. 2024. GLM-4V-9B: An Open Multimodal Version of the GLM-4 Series. https:\/\/huggingface.co\/THUDM\/glm-4v-9b. Model card."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18--1208"},{"key":"e_1_3_2_1_5_1","unstructured":"Yuntao Bai Andy Jones Kamal Ndousse Amanda Askell Anna Chen Nova DasSarma Dawn Drain Stanislav Fort Deep Ganguli Tom Henighan Nicholas Joseph Saurav Kadavath Jackson Kernion Tom Conerly Sheer El-Showk Nelson Elhage Zac Hatfield-Dodds Danny Hernandez Tristan Hume Scott Johnston Shauna Kravec Liane Lovitt Neel Nanda Catherine Olsson Dario Amodei Tom Brown Jack Clark Sam McCandlish Chris Olah Ben Mann and Jared Kaplan. 2022. Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback. arXiv:2204.05862 [cs.CL] https:\/\/arxiv.org\/abs\/2204.05862"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics, Annie Zaenen and Antal van den Bosch (Eds.)","author":"Blitzer John","unstructured":"John Blitzer, Mark Dredze, and Fernando Pereira. 2007. Biographies, Bollywood, Boom-boxes and Blenders: Domain Adaptation for Sentiment Classification. In Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics, Annie Zaenen and Antal van den Bosch (Eds.). Association for Computational Linguistics, Prague, Czech Republic, 440--447. https:\/\/aclanthology.org\/P07--1056\/"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-008--9076--6"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.29"},{"key":"e_1_3_2_1_9_1","unstructured":"Google DeepMind. 2024. Gemini 1.5 Pro: Long-Context Multimodal Model. https:\/\/blog.google\/technology\/ai\/google-gemini-next-generationmodel-february-2024\/. See accompanying technical report PDF."},{"key":"e_1_3_2_1_10_1","unstructured":"DeepSeek-AI Aixin Liu Bei Feng and Bing Xue et al. 2025. DeepSeek-V3 Technical Report. arXiv:2412.19437 [cs.CL] https:\/\/arxiv.org\/abs\/2412.19437"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Dorottya Demszky Dana Movshovitz-Attias Jeongwoo Ko Alan Cowen Gaurav Nemade and Sujith Ravi. 2020. GoEmotions: A Dataset of Fine-Grained Emotions. arXiv:2005.00547 [cs.CL] https:\/\/arxiv.org\/abs\/2005.00547","DOI":"10.18653\/v1\/2020.acl-main.372"},{"key":"e_1_3_2_1_12_1","volume-title":"Acted Facial Expressions In The Wild Database. (10","author":"Dhall Abhinav","year":"2011","unstructured":"Abhinav Dhall, Roland Goecke, Simon Lucey, and Tom Gedeon. 2011. Acted Facial Expressions In The Wild Database. (10 2011)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2012.26"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19--1259"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the NTCIR-13 Conference. National Institute of Informatics, 1--10","author":"Gao Qinghong","year":"2017","unstructured":"Qinghong Gao, Jiannan Hu, Ruifeng Xu, Lin Gui, Yulan He, Kam-Fai Wong, and Qin Lu. 2017. Overview of NTCIR-13 ECA Task. In Proceedings of the NTCIR-13 Conference. National Institute of Informatics, 1--10."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.224"},{"key":"e_1_3_2_1_17_1","unstructured":"Team GLM : Aohan Zeng Bin Xu Bowen Wang et al. 2024. ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools. arXiv:2406.12793 [cs.CL] https:\/\/arxiv.org\/abs\/2406.12793"},{"key":"e_1_3_2_1_18_1","volume-title":"Deliberative Alignment: Reasoning Enables Safer Language Models. arXiv:2412.16339 [cs.CL] https: \/\/arxiv.org\/abs\/2412.16339","author":"Guan Melody Y.","year":"2025","unstructured":"Melody Y. Guan, Manas Joglekar, EricWallace, et al. 2025. Deliberative Alignment: Reasoning Enables Safer Language Models. arXiv:2412.16339 [cs.CL] https: \/\/arxiv.org\/abs\/2412.16339"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1193"},{"key":"e_1_3_2_1_20_1","unstructured":"Zhaopei Huang Jinming Zhao and Qin Jin. 2024. ECR-Chain: Advancing Generative Language Models to Better Emotion-Cause Reasoners through Reasoning Chains. arXiv:2405.10860 [cs.CL] https:\/\/arxiv.org\/abs\/2405.10860"},{"key":"e_1_3_2_1_21_1","volume-title":"Alex Tachard Passos, et al","author":"Jaech Aaron","year":"2024","unstructured":"Aaron Jaech, Adam Lerer, Aiden Low, Alex Carney, Alex Tachard Passos, et al. 2024. Learning to Reason with Large Language Models. Technical Report. OpenAI. https:\/\/openai.com\/index\/learning-to-reason-with-llms\/"},{"key":"e_1_3_2_1_22_1","unstructured":"Shanghai AI Laboratory. 2025. InternLM 2.5--20B: Long-Context Instruction Model. https:\/\/huggingface.co\/internlm\/internlm2_5--20b."},{"key":"e_1_3_2_1_23_1","unstructured":"Shanghai AI Laboratory. 2025. InternLM 3--8B-Instruct: General-Purpose Instruction Model. https:\/\/huggingface.co\/internlm\/internlm3--8b-instruct."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the NAACL HLT 2010 Workshop on Computational Approaches to Analysis and Generation of Emotion in Text","author":"Mei Lee Sophia Yat","unstructured":"Sophia Yat Mei Lee, Ying Chen, and Chu-Ren Huang. 2010. A Text-driven Rulebased System for Emotion Cause Detection. In Proceedings of the NAACL HLT 2010 Workshop on Computational Approaches to Analysis and Generation of Emotion in Text, Diana Inkpen and Carlo Strapparava (Eds.). Association for Computational Linguistics, Los Angeles, CA, 45--53. https:\/\/aclanthology.org\/W10-0206\/"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.849"},{"key":"e_1_3_2_1_26_1","unstructured":"Bo Li Yuanhan Zhang et al. 2024. LLaVA-OneVision: Easy Visual Task Transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_1_27_1","unstructured":"Feng Li Renrui Zhang et al. 2024. LLaVA-NeXT-Interleave: Tackling Multi-image Video and 3D in Large Multimodal Models. arXiv preprint arXiv:2407.07895 (2024)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3216551"},{"key":"e_1_3_2_1_29_1","unstructured":"Yulong Li Zhixiang Lu Feilong Tang Simin Lai Ming Hu Yuxuan Zhang Haochen Xue Zhaodong Wu Imran Razzak Qingxia Li et al. 2025. Rhythm of Opinion: A Hawkes-Graph Framework for Dynamic Propagation Analysis. arXiv preprint arXiv:2504.15072 (2025)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i27.35037"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Yulong Li BoqianWang and Jionglong Su. 2024. GP-PAIL: Generative Adversarial Imitation Learning in Massive-Agent Environments. (2024) 314--322.","DOI":"10.1109\/BDAI62182.2024.10692671"},{"key":"e_1_3_2_1_32_1","volume-title":"Beyond words: Auralllm and signmst-c for precise sign language production and bidirectional accessibility. arXiv preprint arXiv:2501.00765","author":"Li Yulong","year":"2025","unstructured":"Yulong Li, Yuxuan Zhang, Feilong Tang, Mian Zhou, Zhixiang Lu, Haochen Xue, Yifang Wang, Kang Dang, and Jionglong Su. 2025. Beyond words: Auralllm and signmst-c for precise sign language production and bidirectional accessibility. arXiv preprint arXiv:2501.00765 (2025)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3049898"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2025.126924"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Meng Luo Hao Fei Bobo Li et al. 2024. PanoSent: A Panoptic Sextuple Extraction Benchmark for Multimodal Conversational Aspect-based Sentiment Analysis. arXiv:2408.09481 [cs.CL] https:\/\/arxiv.org\/abs\/2408.09481","DOI":"10.1145\/3664647.3680705"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1066"},{"key":"e_1_3_2_1_37_1","unstructured":"Navonil Majumder Soujanya Poria et al. 2019. DialogueRNN: An Attentive RNN for Emotion Detection in Conversations. arXiv:1811.00405 [cs.CL] https:\/\/arxiv.org\/abs\/1811.00405"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i02.5492"},{"key":"e_1_3_2_1_39_1","volume-title":"Goucher et al","author":"AI","year":"2024","unstructured":"OpenAI, :, Aaron Hurst, Adam Lerer, and Adam P. Goucher et al. 2024. GPT-4o System Card. arXiv:2410.21276 [cs.CL] https:\/\/arxiv.org\/abs\/2410.21276"},{"key":"e_1_3_2_1_40_1","unstructured":"OpenAI. 2024. OpenAI o1-mini. https:\/\/openai.com\/index\/openai-o1-miniadvancing-cost-efficient-reasoning\/"},{"key":"e_1_3_2_1_41_1","unstructured":"OpenGVLab. 2025. InternVL 2.5-MPO: An Advanced Multimodal Large Language Model. https:\/\/huggingface.co\/OpenGVLab\/InternVL2_5--78B-MPO."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.2307\/2074241"},{"key":"e_1_3_2_1_43_1","unstructured":"Haiyun Peng Lu Xu Lidong Bing Fei Huang Wei Lu and Luo Si. 2019. Knowing What How and Why: A Near Complete Solution for Aspect-based Sentiment Analysis. arXiv:1911.01616 [cs.CL] https:\/\/arxiv.org\/abs\/1911.01616"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.5555\/265013"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/S14-2004"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1050"},{"key":"e_1_3_2_1_47_1","unstructured":"Soujanya Poria Navonil Majumder Devamanyu Hazarika et al. 2020. Recognizing Emotion Cause in Conversations. arXiv preprint arXiv:2012.11820 (2020)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1534"},{"key":"e_1_3_2_1_49_1","volume-title":"ATOMIC: An Atlas of Machine Commonsense for If-Then Reasoning. arXiv:1811.00146 [cs.CL] https:\/\/arxiv.org\/abs\/1811.00146","author":"Maarten Sap","year":"2019","unstructured":"Maarten Sap et al. 2019. ATOMIC: An Atlas of Machine Commonsense for If-Then Reasoning. arXiv:1811.00146 [cs.CL] https:\/\/arxiv.org\/abs\/1811.00146"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.123"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Richard Socher Alex Perelygin et al. 2013. Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank. In Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing David Yarowsky Timothy Baldwin Anna Korhonen Karen Livescu and Steven Bethard (Eds.). Association for Computational Linguistics Seattle Washington USA 1631--1642. https:\/\/aclanthology.org\/D13--1170\/","DOI":"10.18653\/v1\/D13-1170"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1167"},{"key":"e_1_3_2_1_53_1","unstructured":"LLaVATeam. 2024. LLaVA-NeXT-Video-32B-Qwen. https:\/\/huggingface.co\/lmmslab\/LLaVA-NeXT-Video-32B-Qwen."},{"key":"e_1_3_2_1_54_1","unstructured":"Qwen Team. 2025. Qwen2.5-VL: Instruction-Tuned Vision--Language Model (72B). https:\/\/huggingface.co\/Qwen\/Qwen2.5-VL-72B-Instruct. Technical report and model card."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.348"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.semeval-1.277"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.261"},{"key":"e_1_3_2_1_58_1","unstructured":"An Yang et al. 2024. Qwen 2.5 Technical Report. arXiv preprint arXiv:2412.15115 (2024)."},{"key":"e_1_3_2_1_59_1","unstructured":"Zhou Yang et al. 2024. LongVILA: Scaling Long-Context Visual--Language Models. arXiv preprint arXiv:2408.10188 (2024)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","unstructured":"Wenmeng Yu et al. 2020. CH-SIMS: A Chinese Multimodal Sentiment Analysis Dataset with Fine-grained Annotation of Modality. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics Dan Jurafsky Joyce Chai Natalie Schluter and Joel Tetreault (Eds.). Association for Computational Linguistics Online 3718--3727. doi:10.18653\/v1\/2020.acl-main.343","DOI":"10.18653\/v1\/2020.acl-main.343"},{"key":"e_1_3_2_1_61_1","unstructured":"Kaixiang Zhang et al. 2024. Oryx-1.5: Scaling Visual Instruction Tuning with Qwen-2.5. arXiv preprint arXiv:2409.12961 (2024)."},{"key":"e_1_3_2_1_62_1","unstructured":"Yuanhan Zhang et al. 2024. Video Instruction Tuning with Synthetic Data. arXiv preprint arXiv:2410.02713 (2024). Introduces LLaVA-Video-7B-Qwen2."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Yuxuan Zhang Yulong Li Zichen Yu Feilong Tang Zhixiang Lu Chong Li Kang Dang and Jionglong Su. 2025. Decoding the Flow: CauseMotion for Emotional Causality Analysis in Long-form Conversations. arXiv:2501.00778 [cs.CL] https:\/\/arxiv.org\/abs\/2501.00778","DOI":"10.1109\/AVSS65446.2025.11149922"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.391"},{"key":"e_1_3_2_1_65_1","unstructured":"Weixiang Zhao Yanyan Zhao Zhuojun Li and Bing Qin. 2022. Knowledge-Bridged Causal Interaction Network for Causal Emotion Entailment. arXiv:2212.02995 [cs.CL] https:\/\/arxiv.org\/abs\/2212.02995"},{"key":"e_1_3_2_1_66_1","unstructured":"Fan Zheng Zhiwei Liu et al. 2025. Phi-4: A Multimodal Model Integrating Text Vision and Speech. arXiv preprint arXiv:2503.01743 (2025)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758202","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:58:10Z","timestamp":1765310290000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758202"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":66,"alternative-id":["10.1145\/3746027.3758202","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758202","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}