{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,30]],"date-time":"2026-03-30T08:53:49Z","timestamp":1774860829080,"version":"3.50.1"},"reference-count":34,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2026,3,30]],"date-time":"2026-03-30T00:00:00Z","timestamp":1774828800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,3,30]],"date-time":"2026-03-30T00:00:00Z","timestamp":1774828800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFB3308502"],"award-info":[{"award-number":["2023YFB3308502"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int. J. Mach. Learn. &amp; Cyber."],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1007\/s13042-026-03069-6","type":"journal-article","created":{"date-parts":[[2026,3,30]],"date-time":"2026-03-30T08:03:39Z","timestamp":1774857819000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Meme selection via multiple-choice with masked language models in multimodal dialogue"],"prefix":"10.1007","volume":"17","author":[{"given":"Bo","family":"Liu","sequence":"first","affiliation":[]},{"given":"Xiaojun","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Jianhao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Bowen","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Li","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Weiping","family":"Ding","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,3,30]]},"reference":[{"key":"3069_CR1","doi-asserted-by":"publisher","unstructured":"Zhou N., Jurgens D., Bamman D. (2024) Social meme-ing: Measuring linguistic variation in memes. In: Duh, K., Gomez, H., Bethard, S. (eds.) Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pp. 3005\u20133024. Association for Computational Linguistics, Mexico City, Mexico. https:\/\/doi.org\/10.18653\/v1\/2024.naacl-long.166","DOI":"10.18653\/v1\/2024.naacl-long.166"},{"key":"3069_CR2","doi-asserted-by":"publisher","unstructured":"Das M., Mukherjee A. (2023) BanglaAbuseMeme: A dataset for Bengali abusive meme classification. In: Bouamor, H., Pino, J., Bali, K. (eds.) Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 15498\u201315512. Association for Computational Linguistics, Singapore. https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.959","DOI":"10.18653\/v1\/2023.emnlp-main.959"},{"key":"3069_CR3","doi-asserted-by":"crossref","unstructured":"Zhong S., Huang Z., Gao S., Wen W., Lin L., Zitnik M., Zhou P. (2024) Let\u2019s think outside the box: Exploring leap-of-thought in large language models with creative humor generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13246\u201313257","DOI":"10.1109\/CVPR52733.2024.01258"},{"key":"3069_CR4","doi-asserted-by":"crossref","unstructured":"Zhong Y., Baghel B.K.(2024) Multimodal understanding of memes with fair explanations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2007\u20132017","DOI":"10.1109\/CVPRW63382.2024.00206"},{"key":"3069_CR5","doi-asserted-by":"crossref","unstructured":"Barbieri F., Ballesteros M., Saggion H. (2017) Are emojis predictable? arXiv preprint arXiv:1702.07285","DOI":"10.18653\/v1\/E17-2017"},{"key":"3069_CR6","doi-asserted-by":"crossref","unstructured":"Gao S., Chen X., Liu C., Liu L., Zhao D., Yan R. (2020) Learning to respond with stickers: A framework of unifying multi-modality in multi-turn dialog. In: Proceedings of the Web Conference 2020, pp. 1138\u20131148","DOI":"10.1145\/3366423.3380191"},{"key":"3069_CR7","doi-asserted-by":"crossref","unstructured":"Das A., Kottur S., Gupta K., Singh A., Yadav, D., Moura J.M., Parikh D., Batra D. (2017) Visual dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 326\u2013335","DOI":"10.1109\/CVPR.2017.121"},{"key":"3069_CR8","doi-asserted-by":"crossref","unstructured":"Qi J., Niu Y., Huang J., Zhang H. (2020) Two causal principles for improving visual dialog. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10860\u201310869","DOI":"10.1109\/CVPR42600.2020.01087"},{"key":"3069_CR9","doi-asserted-by":"crossref","unstructured":"Yan R., Zhao D. (2018) Coupled context modeling for deep chit-chat: towards conversations between human and computer. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 2574\u20132583","DOI":"10.1145\/3219819.3220045"},{"key":"3069_CR10","doi-asserted-by":"crossref","unstructured":"Yan R., Zhao DEW. (2017) Joint learning of response ranking and next utterance suggestion in human-computer conversation system. In: Proceedings of the 40th International ACM Sigir Conference on Research and Development in Information Retrieval, pp. 685\u2013694","DOI":"10.1145\/3077136.3080843"},{"key":"3069_CR11","doi-asserted-by":"crossref","unstructured":"Zhang Z., Zhu Y., Fei Z., Zhang J., Zhou J. (2022) Selecting stickers in open-domain dialogue through multitask learning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp. 3053\u20133060","DOI":"10.18653\/v1\/2022.findings-acl.241"},{"key":"3069_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2024.109884","volume":"142","author":"X Zhuang","year":"2025","unstructured":"Zhuang X, Li Z, Zhang C et al (2025) A cross-modal collaborative guiding network for sarcasm explanation in multi-modal multi-party dialogues. Eng Appl Artif Intell 142:109884","journal-title":"Eng Appl Artif Intell"},{"issue":"5","key":"3069_CR13","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3722115","volume":"21","author":"X Zhuang","year":"2025","unstructured":"Zhuang X, Zhou F, Li Z (2025) Multi-modal sarcasm detection via knowledge-aware focused graph convolutional networks. ACM Trans Multimed Comput Commun Appl 21(5):1\u201322","journal-title":"ACM Trans Multimed Comput Commun Appl"},{"key":"3069_CR14","unstructured":"Lee JDMCK., Toutanova K. (2018) Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805"},{"key":"3069_CR15","doi-asserted-by":"crossref","unstructured":"Zellers R., Bisk Y., Schwartz R., Choi Y. (2018)Swag: A large-scale adversarial dataset for grounded commonsense inference. arXiv preprint arXiv:1808.05326","DOI":"10.18653\/v1\/D18-1009"},{"key":"3069_CR16","doi-asserted-by":"crossref","unstructured":"Wang Y., Berant J., Liang P. (2015) Building a semantic parser overnight. Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 1: Long Papers).","DOI":"10.3115\/v1\/P15-1129"},{"key":"3069_CR17","doi-asserted-by":"crossref","unstructured":"Li C., Deng C., Li N., Liu W., Gao X., Tao D. (2018) Self-supervised adversarial hashing networks for cross-modal retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4242\u20134251","DOI":"10.1109\/CVPR.2018.00446"},{"issue":"1","key":"3069_CR18","doi-asserted-by":"publisher","first-page":"1023","DOI":"10.1109\/TCSS.2023.3247445","volume":"11","author":"S Kumar","year":"2023","unstructured":"Kumar S, Kumar D, Singh SR (2023) Gated recursive and sequential deep hierarchical encoding for detecting incongruent news articles. IEEE Transactions on Computational Social Systems 11(1):1023\u20131034","journal-title":"IEEE Transactions on Computational Social Systems"},{"key":"3069_CR19","unstructured":"Lu J., Batra D., Parikh D., Lee S. (2019) Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Adv Neural Inform Processing Syst 32"},{"key":"3069_CR20","unstructured":"Li L., Yatskar M., Yin D., Hsieh C., Chang K. (2019) A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557"},{"key":"3069_CR21","doi-asserted-by":"crossref","unstructured":"Pandey A., Vishwakarma D.K. (2023) Vabdc-net: A framework for visual-caption sentiment recognition via spatio-depth visual attention and bi-directional caption processing. In: Knowledge-Based Systems, 269:110515","DOI":"10.1016\/j.knosys.2023.110515"},{"key":"3069_CR22","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2023.111206","volume":"152","author":"A Pandey","year":"2024","unstructured":"Pandey A, Vishwakarma DK (2024) Progress, achievements, and challenges in multimodal sentiment analysis using deep learning: a survey. Appl Soft Comput 152:111206","journal-title":"Appl Soft Comput"},{"key":"3069_CR23","doi-asserted-by":"crossref","unstructured":"Aggrawal S., Pandey A., Vishwakarma D.K. (2023) Multimodal sarcasm recognition by fusing textual, visual and acoustic content via multi-headed attention. In: Proceedings of WCONF","DOI":"10.1109\/WCONF58270.2023.10235179"},{"key":"3069_CR24","unstructured":"Aggrawal S., Pandey A., Vishwakarma D.K. (2024) Modeling visual semantics via image captioning to extract multi-level cross-modal semantic incongruity with attention"},{"key":"3069_CR25","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown T, Mann B, Ryder N, Subbiah M, Kaplan JD, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A et al (2020) Language models are few-shot learners. Adv Neural Inf Process Syst 33:1877\u20131901","journal-title":"Adv Neural Inf Process Syst"},{"key":"3069_CR26","doi-asserted-by":"crossref","unstructured":"Schick T., Sch\u00fctze H. (2020) It\u2019s not just size that matters: Small language models are also few-shot learners. arXiv preprint arXiv:2009.07118","DOI":"10.18653\/v1\/2021.naacl-main.185"},{"key":"3069_CR27","unstructured":"Gao T., Fisch A., Chen D. (2020) Making pre-trained language models better few-shot learners. arXiv preprint arXiv:2012.15723"},{"key":"3069_CR28","doi-asserted-by":"crossref","unstructured":"Lester B., Al-Rfou, R., Constant N. (2021) The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"3069_CR29","unstructured":"Liu X., Zheng Y., Du Z., Ding M., Qian Y., Yang Z., Tang J. (2021) Gpt understands, too. arXiv preprint arXiv:2103.10385"},{"key":"3069_CR30","doi-asserted-by":"crossref","unstructured":"Liu X., Ji K., Fu Y., Du Z., Yang Z., Tang J. (2021) P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks. arXiv preprint arXiv:2110.07602","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"3069_CR31","unstructured":"Li X.L., Liang P. (2021) Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190"},{"key":"3069_CR32","unstructured":"Fei Z., Li Z., Zhang J., Feng Y., Zhou J. (2021) Towards expressive communication with internet memes: a new multimodal conversation dataset and benchmark. arXiv preprint arXiv:2109.01839"},{"key":"3069_CR33","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al. (2021) Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 . PMLR"},{"key":"3069_CR34","unstructured":"Achiam J., Adler S., Agarwal S., Ahmad L., Akkaya I., Aleman F.L., Almeida D., Altenschmidt J., Altman S., Anadkat S., et al. (2023) Gpt-4 technical report. arXiv preprint arXiv:2303.08774"}],"container-title":["International Journal of Machine Learning and Cybernetics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-026-03069-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13042-026-03069-6","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-026-03069-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,30]],"date-time":"2026-03-30T08:03:59Z","timestamp":1774857839000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13042-026-03069-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,30]]},"references-count":34,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2026,5]]}},"alternative-id":["3069"],"URL":"https:\/\/doi.org\/10.1007\/s13042-026-03069-6","relation":{},"ISSN":["1868-8071","1868-808X"],"issn-type":[{"value":"1868-8071","type":"print"},{"value":"1868-808X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3,30]]},"assertion":[{"value":"23 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 March 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 March 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"239"}}