{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:55:56Z","timestamp":1781535356881,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"the Ministry of education of Humanitiesand Social Science project, China","award":["23YJE740002"],"award-info":[{"award-number":["23YJE740002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810673","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1495-1503","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Query-Guided Conflict Inference and Incongruity-Aware Alignment for Implicit Hate Speech Detection in Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-6550-8840","authenticated-orcid":false,"given":"Shuo","family":"Liu","sequence":"first","affiliation":[{"name":"Jianghan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3183-2576","authenticated-orcid":false,"given":"Jiakang","family":"Yu","sequence":"additional","affiliation":[{"name":"Jianghan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5143-6774","authenticated-orcid":false,"given":"Xun","family":"Zhu","sequence":"additional","affiliation":[{"name":"Jianghan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6910-499X","authenticated-orcid":false,"given":"Hongtao","family":"Deng","sequence":"additional","affiliation":[{"name":"Jianghan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5500-5982","authenticated-orcid":false,"given":"Yinxia","family":"Lou","sequence":"additional","affiliation":[{"name":"Jianghan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_3_2_3_2","unstructured":"Alexei Baevski Yuhao Zhou Abdelrahman Mohamed and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33 (2020) 12449\u201312460."},{"key":"e_1_3_3_2_4_2","first-page":"894","volume-title":"Proceedings of the 34th International Conference on Machine Learning (ICML)","author":"Cuturi Marco","year":"2017","unstructured":"Marco Cuturi and Mathieu Blondel. 2017. Soft-dtw: a differentiable loss function for time-series. In Proceedings of the 34th International Conference on Machine Learning (ICML). PMLR, Sydney, NSW, Australia, 894\u2013903."},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1609\/icwsm.v17i1.22209"},{"key":"e_1_3_3_2_6_2","first-page":"4171","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). Association for Computational Linguistics, Minneapolis, Minnesota, 4171\u20134186."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Chhavi Dixit and Shashank\u00a0Mouli Satapathy. 2024. Deep CNN with late fusion for real time multimodal emotion recognition. Expert Systems with Applications 240 (2024) 122579.","DOI":"10.1016\/j.eswa.2023.122579"},{"key":"e_1_3_3_2_8_2","first-page":"1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations (ICLR). OpenReview.net, Vienna, Austria, 1\u201321."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.29"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.234"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1609\/icwsm.v8i1.14550"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Shing-Yun Jung Chia-Hung Liao Yu-Sheng Wu Shyan-Ming Yuan and Chuen-Tsai Sun. 2021. Efficiently classifying lung sounds through depthwise separable CNN models with fused STFT and MFCC features. Diagnostics 11 4 (2021) 732.","DOI":"10.3390\/diagnostics11040732"},{"key":"e_1_3_3_2_14_2","unstructured":"Prannay Khosla Piotr Teterwak Chen Wang Aaron Sarna Yonglong Tian Phillip Isola Aaron Maschinot Ce Liu and Dilip Krishnan. 2020. Supervised contrastive learning. Advances in neural information processing systems 33 (2020) 18661\u201318673."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Ronglu Li Tianyi Zhang and Rubo Zhang. 2024. Weakly supervised temporal action localization: a survey. Multimedia Tools and Applications 83 32 (2024) 78361\u201378386.","DOI":"10.1007\/s11042-024-18554-9"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Yangyang Li Yuelin Li Shihuai Zhang Guangyuan Liu Yanqiao Chen Ronghua Shang and Licheng Jiao. 2024. An attention-based context-aware multimodal fusion method for sarcasm detection using inter-modality inconsistency. Knowledge-Based Systems 287 (2024) 111457.","DOI":"10.1016\/j.knosys.2024.111457"},{"key":"e_1_3_3_2_17_2","first-page":"13","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Advances in Neural Information Processing Systems (NeurIPS) , Vol.\u00a032. Curran Associates, Inc., Vancouver, BC, Canada, 13\u201323."},{"key":"e_1_3_3_2_18_2","first-page":"720","volume-title":"Proceedings of the 20th International Conference on Natural Language Processing (ICON)","author":"Mandal Atanu","year":"2023","unstructured":"Atanu Mandal, Gargi Roy, Amit Barman, Indranil Dutta, and Sudip\u00a0Kumar Naskar. 2023. Attentive Fusion: A Transformer-based Approach to Multimodal Hate Speech Detection. In Proceedings of the 20th International Conference on Natural Language Processing (ICON). Association for Computational Linguistics, Goa, India, 720\u2013728."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"crossref","unstructured":"Saif\u00a0M Mohammad and Peter\u00a0D Turney. 2013. Crowdsourcing a word\u2013emotion association lexicon. Computational intelligence 29 3 (2013) 436\u2013465.","DOI":"10.1111\/j.1467-8640.2012.00460.x"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.124"},{"key":"e_1_3_3_2_21_2","volume-title":"Improving Language Understanding by Generative Pre-Training","author":"Radford Alec","year":"2018","unstructured":"Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. 2018. Improving Language Understanding by Generative Pre-Training. Technical Report. OpenAI, San Francisco, CA, USA."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.842"},{"key":"e_1_3_3_2_23_2","unstructured":"Qiyue Sun Tailin Chen Yinghui Zhang and Zeyu Fu. 2025. Weakly-supervised Multimodal Hate Content Localisation in Videos. arxiv:https:\/\/arXiv.org\/abs\/2512.10408\u00a0[cs.CV]"},{"key":"e_1_3_3_2_24_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. LLaMA: Open and Efficient Foundation Language Models. arxiv:https:\/\/arXiv.org\/abs\/2302.13971\u00a0[cs.CL]"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1656"},{"key":"e_1_3_3_2_26_2","first-page":"5998","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All You Need. In Advances in Neural Information Processing Systems (NeurIPS) , Vol.\u00a030. Curran Associates, Inc., Long Beach, CA, USA, 5998\u20136008."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681521"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Lan Wang Junjie Peng Cangzhi Zheng Tong Zhao and Li\u2019an Zhu. 2024. A cross modal hierarchical fusion multimodal sentiment analysis method based on multi-task learning. Information Processing & Management 61 3 (2024) 103675.","DOI":"10.1016\/j.ipm.2024.103675"},{"key":"e_1_3_3_2_29_2","first-page":"23318","volume-title":"Proceedings of the 39th International Conference on Machine Learning (ICML)","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022. OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework. In Proceedings of the 39th International Conference on Machine Learning (ICML). PMLR, Baltimore, MD, USA, 23318\u201323340."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00250"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28423"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME59968.2025.11209221"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-95-7081-2_34"},{"key":"e_1_3_3_2_34_2","first-page":"243","volume-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Singapore, 243\u2013253."},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.705"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:44:55Z","timestamp":1781534695000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810673"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":34,"alternative-id":["10.1145\/3805622.3810673","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810673","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}