{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:08:34Z","timestamp":1765357714150,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Natural Science Foundation of Tianjin, China","award":["NO.20JCJQJC00020"],"award-info":[{"award-number":["NO.20JCJQJC00020"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680987","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"9631-9640","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Perceive before Respond: Improving Sticker Response Selection by Emotion Distillation and Hard Mining"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-3947-1367","authenticated-orcid":false,"given":"Wuyou","family":"Xia","sequence":"first","affiliation":[{"name":"VCIP &amp; TMCC &amp; DISSec, College of Computer Science, Nankai University &amp; Nankai International Advanced Research Institute (SHENZHEN-FUTIAN), &amp; Pengcheng Laboratory, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7746-233X","authenticated-orcid":false,"given":"Shengzhe","family":"Liu","sequence":"additional","affiliation":[{"name":"VCIP &amp; TMCC &amp; DISSec, College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4949-2168","authenticated-orcid":false,"given":"Qin","family":"Rong","sequence":"additional","affiliation":[{"name":"VCIP &amp; TMCC &amp; DISSec, College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9494-7013","authenticated-orcid":false,"given":"Guoli","family":"Jia","sequence":"additional","affiliation":[{"name":"VCIP &amp; TMCC &amp; DISSec, College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3177-3538","authenticated-orcid":false,"given":"Eunil","family":"Park","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0219-3443","authenticated-orcid":false,"given":"Jufeng","family":"Yang","sequence":"additional","affiliation":[{"name":"VCIP &amp; TMCC &amp; DISSec, College of Computer Science, Nankai University &amp; Nankai International Advanced Research Institute (SHENZHEN-FUTIAN), &amp; Pengcheng Laboratory, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"Riswanda Al Farisi Ridi Ferdiana and Teguh Bharata Adji. 2022. The effect of anthropomorphic design cues on increasing chatbot empathy. In ICISIT.","DOI":"10.1109\/ICISIT54091.2022.9873008"},{"key":"e_1_3_2_2_2_1","unstructured":"Mingxiao An Fangzhao Wu Chuhan Wu Kun Zhang Zheng Liu and Xing Xie. 2019. Neural news recommendation with long-and short-term user representations. In ACL."},{"key":"e_1_3_2_2_3_1","volume-title":"Kriti Aggarwal, Subhojit Som, Songhao Piao, and Furu Wei.","author":"Bao Hangbo","year":"2022","unstructured":"Hangbo Bao, Wenhui Wang, Li Dong, Qiang Liu, Owais Khan Mohammed, Kriti Aggarwal, Subhojit Som, Songhao Piao, and Furu Wei. 2022. Vlmo: Unified vision-language pre-training with mixture-of-modality-experts. NeurIPS (2022)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2862363"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.5555\/3100515.3100521"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2947789"},{"key":"e_1_3_2_2_7_1","volume-title":"Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua Bengio.","author":"Cho Kyunghyun","year":"2014","unstructured":"Kyunghyun Cho, Bart van Merrienboer, cCaglar G\u00fclccehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua Bengio. 2014. Learning phrase representations using RNN encoder-decoder for statistical machine translation. In EMNLP."},{"key":"e_1_3_2_2_8_1","volume-title":"Imagenet: A large-scale hierarchical image database. In CVPR.","author":"Deng Jia","year":"2009","unstructured":"Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. 2009. Imagenet: A large-scale hierarchical image database. In CVPR."},{"key":"e_1_3_2_2_9_1","volume-title":"BERT: Pre-training of deep bidirectional transformers for language understanding. In NAACL.","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of deep bidirectional transformers for language understanding. In NAACL."},{"key":"e_1_3_2_2_10_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ICLR","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ICLR (2021)."},{"key":"e_1_3_2_2_11_1","volume-title":"Towards expressive communication with internet memes: A new multimodal conversation dataset and benchmark. arXiv preprint arXiv:2109.01839","author":"Fei Zhengcong","year":"2021","unstructured":"Zhengcong Fei, Zekang Li, Jinchao Zhang, Yang Feng, and Jie Zhou. 2021. Towards expressive communication with internet memes: A new multimodal conversation dataset and benchmark. arXiv preprint arXiv:2109.01839 (2021)."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Shen Gao Xiuying Chen Chang Liu Li Liu Dongyan Zhao and Rui Yan. 2020. Learning to respond with stickers: A framework of unifying multi-modality in multi-turn dialog. In WWW.","DOI":"10.1145\/3366423.3380191"},{"key":"e_1_3_2_2_13_1","first-page":"1","article-title":"Learning to respond with your favorite stickers: A framework of unifying multi-modality and user preference in multi-turn dialog","volume":"39","author":"Gao Shen","year":"2021","unstructured":"Shen Gao, Xiuying Chen, Li Liu, Dongyan Zhao, and Rui Yan. 2021. Learning to respond with your favorite stickers: A framework of unifying multi-modality and user preference in multi-turn dialog. ACM Transactions on Information Systems, Vol. 39 (2021), 1--32.","journal-title":"ACM Transactions on Information Systems"},{"key":"e_1_3_2_2_14_1","unstructured":"Feng Ge Weizhao Li Haopeng Ren and Yi Cai. 2022. Towards exploiting sticker for multimodal sentiment analysis in social media: A new dataset and baseline. In COLING."},{"key":"e_1_3_2_2_15_1","volume-title":"Gelbukh","author":"Ghosal Deepanway","year":"2019","unstructured":"Deepanway Ghosal, Navonil Majumder, Soujanya Poria, Niyati Chhaya, and Alexander F. Gelbukh. 2019. DialogueGCN: A graph convolutional neural network for emotion recognition in conversation. In EMNLP-IJCNLP."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"crossref","unstructured":"Dalu Guo Chang Xu and Dacheng Tao. 2019. Image-question-answer synergistic network for visual dialog. In CVPR.","DOI":"10.1109\/CVPR.2019.01068"},{"key":"e_1_3_2_2_17_1","volume-title":"An overview of chatbot-based mobile mental health apps: insights from app description and user reviews. JMIR mHealth and uHealth","author":"Romael Haque MD","year":"2023","unstructured":"MD Romael Haque and Sabirat Rubya. 2023. An overview of chatbot-based mobile mental health apps: insights from app description and user reviews. JMIR mHealth and uHealth, Vol. 11 (2023), e44838."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.2307\/2346830"},{"key":"e_1_3_2_2_19_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR."},{"key":"e_1_3_2_2_20_1","unstructured":"Geoffrey Hinton Oriol Vinyals Jeff Dean et al. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"crossref","unstructured":"Taichi Ishiwatari Yuki Yasuda Taro Miyazaki and Jun Goto. 2020. Relation-aware graph attention networks with relational position encodings for emotion recognition in conversations. In EMNLP.","DOI":"10.18653\/v1\/2020.emnlp-main.597"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1093\/jcmc\/zmaa003"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"crossref","unstructured":"Abhishek Laddha Mohamed Hanoosh Debdoot Mukherjee Parth Patwa and Ankur Narang. 2020. Understanding chat messages for sticker recommendation in messaging apps. In AAAI.","DOI":"10.1609\/aaai.v34i08.7019"},{"key":"e_1_3_2_2_25_1","volume-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML."},{"key":"e_1_3_2_2_26_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. NeurIPS","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. NeurIPS (2021)."},{"key":"e_1_3_2_2_27_1","unstructured":"Shen Li Zhe Zhao Renfen Hu Wensi Li Tao Liu and Xiaoyong Du. 2018. Analogical reasoning on chinese morphological and semantic relations. In ACL."},{"key":"e_1_3_2_2_28_1","unstructured":"Tianhong Li Jianguo Li Zhuang Liu and Changshui Zhang. 2020. Few sample knowledge distillation for efficient network compression. In CVPR."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.09.057"},{"key":"e_1_3_2_2_30_1","unstructured":"Xiangpeng Li Jingkuan Song Lianli Gao Xianglong Liu Wenbing Huang Xiangnan He and Chuang Gan. 2019. Beyond rnns: Positional self-attention with co-attention for video question answering. In AAAI."},{"key":"e_1_3_2_2_31_1","volume-title":"Reply with Sticker: New Dataset and Model for Sticker Retrieval. arXiv preprint arXiv:2403.05427","author":"Liang Bin","year":"2024","unstructured":"Bin Liang, Bingbing Wang, Zhixin Bai, Qiwei Lang, Mingwei Sun, Kaiheng Hou, Kam-Fai Wong, and Ruifeng Xu. 2024. Reply with Sticker: New Dataset and Model for Sticker Retrieval. arXiv preprint arXiv:2403.05427 (2024)."},{"key":"e_1_3_2_2_32_1","unstructured":"Tsung-Yi Lin Priya Goyal Ross Girshick Kaiming He and Piotr Doll\u00e1r. 2017. Focal loss for dense object detection. In ICCV."},{"key":"e_1_3_2_2_33_1","unstructured":"Shengzhe Liu Xin Zhang and Jufeng Yang. 2022. SER30K: A Large-Scale Dataset for Sticker Emotion Recognition. In ACM MM."},{"key":"e_1_3_2_2_34_1","volume-title":"Towards Building an Open-Domain Dialogue System Incorporated with Internet Memes","author":"Lu Hua","year":"2023","unstructured":"Hua Lu, Zhen Guo, Chanjuan Li, Yunyi Yang, Huang He, and Siqi Bao. 2023. Towards Building an Open-Domain Dialogue System Incorporated with Internet Memes. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2023)."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2021.107751"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"crossref","unstructured":"Navonil Majumder Soujanya Poria Devamanyu Hazarika Rada Mihalcea Alexander Gelbukh and Erik Cambria. 2019. DialogueRNN: An attentive rnn for emotion detection in conversations. In AAAI.","DOI":"10.1609\/aaai.v33i01.33016818"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"crossref","unstructured":"Seyed Iman Mirzadeh Mehrdad Farajtabar Ang Li Nir Levine Akihiro Matsukawa and Hassan Ghasemzadeh. 2020. Improved knowledge distillation via teacher assistant. In AAAI.","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"e_1_3_2_2_38_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_2_39_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. NeurIPS","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. NeurIPS (2019)."},{"key":"e_1_3_2_2_40_1","volume-title":"Mitchell","author":"Platanios Emmanouil Antonios","year":"2019","unstructured":"Emmanouil Antonios Platanios, Otilia Stretcu, Graham Neubig, Barnab\u00e1s P\u00f3czos, and Tom M. Mitchell. 2019. Competence-based Curriculum Learning for Neural Machine Translation. In NAACL-HLT."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.tele.2018.06.005"},{"key":"e_1_3_2_2_42_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_2_2_43_1","unstructured":"Joshua David Robinson Ching-Yao Chuang Suvrit Sra and Stefanie Jegelka. 2021. Contrastive Learning with Hard Negative Samples. In ICLR."},{"key":"e_1_3_2_2_44_1","volume-title":"Antoine Chassang, Carlo Gatta, and Yoshua Bengio.","author":"Romero Adriana","year":"2015","unstructured":"Adriana Romero, Nicolas Ballas, Samira Ebrahimi Kahou, Antoine Chassang, Carlo Gatta, and Yoshua Bengio. 2015. FitNets: Hints for Thin Deep Nets. In ICLR."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"crossref","unstructured":"Christian Szegedy Vincent Vanhoucke Sergey Ioffe Jonathon Shlens and Zbigniew Wojna. 2016. Rethinking the inception architecture for computer vision. In CVPR.","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_2_46_1","first-page":"2457","article-title":"Emoticon, emoji, and sticker use in computer-mediated communication: A review of theories and research findings","volume":"13","author":"Tang Ying","year":"2019","unstructured":"Ying Tang and Khe Foon Hew. 2019. Emoticon, emoji, and sticker use in computer-mediated communication: A review of theories and research findings. International Journal of Communication, Vol. 13 (2019), 2457--2483.","journal-title":"International Journal of Communication"},{"key":"e_1_3_2_2_47_1","unstructured":"Chongyang Tao Wei Wu Can Xu Wenpeng Hu Dongyan Zhao and Rui Yan. 2019. Multi-representation fusion network for multi-turn response selection in retrieval-based chatbots. In ACM WSDM."},{"key":"e_1_3_2_2_48_1","volume-title":"Attention is all you need. NeurIPS","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. NeurIPS (2017)."},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"crossref","unstructured":"Han Wang and Roy Ka-Wei Lee. 2024. MemeCraft: Contextual and Stance-Driven Multimodal Meme Generation. In WWW.","DOI":"10.1145\/3589334.3648151"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"crossref","unstructured":"Xingyao Wang and David Jurgens. 2021. An animated picture says at least a thousand words: Selecting gif-based replies in multimodal dialog. In EMNLP.","DOI":"10.18653\/v1\/2021.findings-emnlp.276"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2636150"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"crossref","unstructured":"Yu Wu Wei Wu Chen Xing Ming Zhou and Zhoujun Li. 2017. Sequential matching network: A new architecture for multi-turn response selection in retrieval-based chatbots. In ACL.","DOI":"10.18653\/v1\/P17-1046"},{"key":"e_1_3_2_2_53_1","unstructured":"Xueting Yan Ishan Misra Abhinav Gupta Deepti Ghadiyaram and Dhruv Mahajan. 2020. ClusterFit: Improving generalization of visual representations. In CVPR."},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"crossref","unstructured":"Lin Yang Yi Shen Yue Mao and Longjun Cai. 2022. Hybrid curriculum learning for emotion recognition in conversation. In AAAI.","DOI":"10.1609\/aaai.v36i10.21413"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"crossref","unstructured":"Linfeng Zhang Xin Chen Xiaobing Tu Pengfei Wan Ning Xu and Kaisheng Ma. 2022. Wavelet knowledge distillation: Towards efficient image-to-image translation. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01214"},{"key":"e_1_3_2_2_56_1","volume-title":"StickerConv: Generating Multimodal Empathetic Responses from Scratch. arXiv preprint arXiv:2402.01679","author":"Zhang Yiqun","year":"2024","unstructured":"Yiqun Zhang, Fanheng Kong, Peidong Wang, Shuang Sun, Lingshuai Wang, Shi Feng, Daling Wang, Yifei Zhang, and Kaisong Song. 2024. StickerConv: Generating Multimodal Empathetic Responses from Scratch. arXiv preprint arXiv:2402.01679 (2024)."},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"crossref","unstructured":"Zhexin Zhang Yeshuang Zhu Zhengcong Fei Jinchao Zhang and Jie Zhou. 2022. Selecting stickers in open-domain dialogue through multitask learning. In ACL.","DOI":"10.18653\/v1\/2022.findings-acl.241"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3094362"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3025453.3025800"},{"key":"e_1_3_2_2_60_1","volume-title":"Dianhai Yu, and Hua Wu.","author":"Zhou Xiangyang","year":"2018","unstructured":"Xiangyang Zhou, Lu Li, Daxiang Dong, Yi Liu, Ying Chen, Wayne Xin Zhao, Dianhai Yu, and Hua Wu. 2018. Multi-turn response selection for chatbots with deep attention matching network. In ACL."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680987","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680987","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:35Z","timestamp":1750295855000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680987"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":60,"alternative-id":["10.1145\/3664647.3680987","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680987","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}