{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:14:18Z","timestamp":1765340058404,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","funder":[{"name":"the Innovation Team Project of Guangdong Province of China","award":["No. 2024KCXTD017"],"award-info":[{"award-number":["No. 2024KCXTD017"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755691","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"5794-5803","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MGHFT: Multi-Granularity Hierarchical Fusion Transformer for Cross-Modal Sticker Emotion Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4570-2271","authenticated-orcid":false,"given":"Jian","family":"Chen","sequence":"first","affiliation":[{"name":"Shenzhen MSU-BIT University, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8571-118X","authenticated-orcid":false,"given":"Yuxuan","family":"Hu","sequence":"additional","affiliation":[{"name":"Shenzhen MSU-BIT University, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0155-8447","authenticated-orcid":false,"given":"Haifeng","family":"Lu","sequence":"additional","affiliation":[{"name":"Shenzhen MSU-BIT University, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1717-5785","authenticated-orcid":false,"given":"Wei","family":"Wang","sequence":"additional","affiliation":[{"name":"Shenzhen MSU-BIT University, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7345-5071","authenticated-orcid":false,"given":"Min","family":"Yang","sequence":"additional","affiliation":[{"name":"Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4592-3875","authenticated-orcid":false,"given":"Chengming","family":"Li","sequence":"additional","affiliation":[{"name":"Shenzhen MSU-BIT University, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4952-699X","authenticated-orcid":false,"given":"Xiping","family":"Hu","sequence":"additional","affiliation":[{"name":"Shenzhen MSU-BIT University, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Recognition-by-components: a theory of human image understanding. Psychological review","author":"Biederman Irving","year":"1987","unstructured":"Irving Biederman. 1987. Recognition-by-components: a theory of human image understanding. Psychological review, Vol. 94, 2 (1987), 115."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3648145"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3274299"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681522"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680781"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.chb.2007.04.004"},{"key":"e_1_3_2_1_7_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR, Vol. abs\/1810.04805 (2018). arXiv:1810.04805 http:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_1_8_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of De-Factify: Workshop on Multimodal Fact Checking and Hate Speech Detection, CEUR.","author":"Duan Baishan","year":"2022","unstructured":"Baishan Duan and Yuesheng Zhu. 2022. BROWALLIA at Memotion 2.0 2022: Multimodal memotion analysis with modified ogb strategies. In Proceedings of De-Factify: Workshop on Multimodal Fact Checking and Hate Speech Detection, CEUR."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01172"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Susan Herring and Ashley Dainas. 2017. ''Nice picture comment!'' Graphicons in Facebook comment threads. (2017).","DOI":"10.24251\/HICSS.2017.264"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679687"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_1_15_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"Memeguard: An llm and vlm-based framework for advancing content moderation via meme intervention. arXiv preprint arXiv:2406.05344","author":"Jha Prince","year":"2024","unstructured":"Prince Jha, Raghav Jain, Konika Mandal, Aman Chadha, Sriparna Saha, and Pushpak Bhattacharyya. 2024. Memeguard: An llm and vlm-based framework for advancing content moderation via meme intervention. arXiv preprint arXiv:2406.05344 (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-021-11298-w"},{"key":"e_1_3_2_1_18_1","volume-title":"A brief review of facial emotion recognition based on visual information. sensors","author":"Byoung Chul Ko.","year":"2018","unstructured":"Byoung Chul Ko. 2018. A brief review of facial emotion recognition based on visual information. sensors, Vol. 18, 2 (2018), 401."},{"volume-title":"Reading images: The grammar of visual design","author":"Kress Gunther","key":"e_1_3_2_1_19_1","unstructured":"Gunther Kress and Theo Van Leeuwen. 2020. Reading images: The grammar of visual design. Routledge."},{"key":"e_1_3_2_1_20_1","volume-title":"M3Hop-CoT: Misogynous Meme Identification with Multimodal Multi-hop Chain-of-Thought. arXiv preprint arXiv:2410.09220","author":"Kumari Gitanjali","year":"2024","unstructured":"Gitanjali Kumari, Kirtan Jain, and Asif Ekbal. 2024. M3Hop-CoT: Misogynous Meme Identification with Multimodal Multi-hop Chain-of-Thought. arXiv preprint arXiv:2410.09220 (2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2957265.2961858"},{"key":"e_1_3_2_1_22_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_23_1","volume-title":"Reply with Sticker: New Dataset and Model for Sticker Retrieval. arXiv preprint arXiv:2403.05427","author":"Liang Bin","year":"2024","unstructured":"Bin Liang, Bingbing Wang, Zhixin Bai, Qiwei Lang, Mingwei Sun, Kaiheng Hou, Lanjun Zhou, Ruifeng Xu, and Kam-Fai Wong. 2024. Reply with Sticker: New Dataset and Model for Sticker Retrieval. arXiv preprint arXiv:2403.05427 (2024)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645381"},{"key":"e_1_3_2_1_25_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li Bo Li Yuanhan Zhang Sheng Shen and Yong Jae Lee. 2024. LLaVA-NeXT: Improved reasoning OCR and world knowledge. https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548407"},{"key":"e_1_3_2_1_27_1","volume-title":"ELEMO: Elements Focused Emotion Recognition for Sticker Images. In Chinese Conference on Pattern Recognition and Computer Vision (PRCV). Springer, 231-245","author":"Luo Min","year":"2024","unstructured":"Min Luo, Boda Lin, Binghao Tang, Haolong Yan, and Si Li. 2024. ELEMO: Elements Focused Emotion Recognition for Sticker Images. In Chinese Conference on Pattern Recognition and Computer Vision (PRCV). Springer, 231-245."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1184"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.sigdial-1.54"},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i8.26166"},{"volume-title":"Memes in digital culture","author":"Shifman Limor","key":"e_1_3_2_1_32_1","unstructured":"Limor Shifman. 2013. Memes in digital culture. MIT press."},{"key":"e_1_3_2_1_33_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_34_1","volume-title":"International conference on machine learning. PMLR, 6105-6114","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc Le. 2019. Efficientnet: Rethinking model scaling for convolutional neural networks. In International conference on machine learning. PMLR, 6105-6114."},{"key":"e_1_3_2_1_35_1","volume-title":"International journal of communication","author":"Tang Ying","year":"2019","unstructured":"Ying Tang and Khe Foon Hew. 2019. Emoticon, emoji, and sticker use in computer-mediated communication: A review of theories and research findings. International journal of communication, Vol. 13 (2019), 2457-2483."},{"key":"e_1_3_2_1_36_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111778"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1177\/0894439315590209"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680987"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3532019"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681060"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01864"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00791"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681280"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.9987"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612560"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-023-16081-7"},{"key":"e_1_3_2_1_49_1","volume-title":"Stickerconv: generating multimodal empathetic responses from scratch. arXiv preprint arXiv:2402.01679","author":"Zhang Yiqun","year":"2024","unstructured":"Yiqun Zhang, Fanheng Kong, Peidong Wang, Shuang Sun, Lingshuai Wang, Shi Feng, Daling Wang, Yifei Zhang, and Kaisong Song. 2024a. Stickerconv: generating multimodal empathetic responses from scratch. arXiv preprint arXiv:2402.01679 (2024)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351062"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3094362"},{"key":"e_1_3_2_1_52_1","volume-title":"Multi-Granular Multimodal Clue Fusion for Meme Understanding. arXiv preprint arXiv:2503.12560","author":"Zheng Li","year":"2025","unstructured":"Li Zheng, Hao Fei, Ting Dai, Zuquan Peng, Fei Li, Huisheng Ma, Chong Teng, and Donghong Ji. 2025. Multi-Granular Multimodal Clue Fusion for Meme Understanding. arXiv preprint arXiv:2503.12560 (2025)."},{"key":"e_1_3_2_1_53_1","volume-title":"Places: A 10 million image database for scene recognition","author":"Zhou Bolei","year":"2017","unstructured":"Bolei Zhou, Agata Lapedriza, Aditya Khosla, Aude Oliva, and Antonio Torralba. 2017. Places: A 10 million image database for scene recognition. IEEE transactions on pattern analysis and machine intelligence, Vol. 40, 6 (2017), 1452-1464."},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of De-Factify: Workshop on Multimodal Fact Checking and Hate Speech Detection, CEUR.","author":"Zhuang Yan","year":"2022","unstructured":"Yan Zhuang and Yanru Zhang. 2022. Yet at Memotion 2.0 2022: Hate speech detection combining bilstm and fully connected layers. In Proceedings of De-Factify: Workshop on Multimodal Fact Checking and Hate Speech Detection, CEUR."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755691","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:27Z","timestamp":1765339767000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755691"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":54,"alternative-id":["10.1145\/3746027.3755691","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755691","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}