{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T18:16:12Z","timestamp":1771265772796,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":54,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819947515","type":"print"},{"value":"9789819947522","type":"electronic"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-981-99-4752-2_50","type":"book-chapter","created":{"date-parts":[[2023,7,30]],"date-time":"2023-07-30T16:02:10Z","timestamp":1690732930000},"page":"609-622","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["A Survey on Multimodal Named Entity Recognition"],"prefix":"10.1007","author":[{"given":"Shenyi","family":"Qian","sequence":"first","affiliation":[]},{"given":"Wenduo","family":"Jin","sequence":"additional","affiliation":[]},{"given":"Yonggang","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Jiangtao","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Yaqiong","family":"Qiao","sequence":"additional","affiliation":[]},{"given":"Jinyu","family":"Lu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,7,31]]},"reference":[{"key":"50_CR1","doi-asserted-by":"crossref","unstructured":"Moon, S., Neves, L., Carvalho, V.: Multimodal named entity recognition for short social media posts. In: NAACL HLT 2018 - 2018 Conference on North American Chapter Association Computing Linguistic Human Language Technology \u2013 Proceedings of Conference, vol. 1, pp. 852\u2013860 (2018)","DOI":"10.18653\/v1\/N18-1078"},{"key":"50_CR2","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/TKDE.2020.2981314","volume":"34","author":"J Li","year":"2022","unstructured":"Li, J., Sun, A., Han, J., Li, C.: A survey on deep learning for named entity recognition. IEEE Trans. Knowl. Data Eng. 34, 50\u201370 (2022)","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"50_CR3","doi-asserted-by":"crossref","unstructured":"Xu, B., Huang, S., Sha, C., Wang, H.: MAF: a general matching and alignment framework for multimodal named entity recognition. In: WSDM 2022 \u2013 Proceedings of the 15th ACM International Conference on Web Search Data Mining, pp. 1215\u20131223 (2022)","DOI":"10.1145\/3488560.3498475"},{"key":"50_CR4","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Fu, J., Liu, X., Huang, X.: Adaptive co-attention network for named entity recognition in tweets. In: 32nd AAAI Conference on Artificial Intelligence. AAAI 2018, pp. 5674\u20135681 (2018)","DOI":"10.1609\/aaai.v32i1.11962"},{"key":"50_CR5","doi-asserted-by":"crossref","unstructured":"Lu, D., Neves, L., Carvalho, V., Zhang, N., Ji, H.: Visual attention model for name tagging in multimodal social media. ACL 2018 - 56th Annual Meeting of the Association for Computational Linguistics Proceedings of the Conference (Long Pap. 1, 1990\u20131999 (2018)","DOI":"10.18653\/v1\/P18-1185"},{"key":"50_CR6","doi-asserted-by":"crossref","unstructured":"Zhang, D., Wei, S., Li, S., Wu, H., Zhu, Q., Zhou, G.: Multi-modal graph fusion for named entity recognition with targeted visual guidance. In: 35th AAAI Conference on Artificial Intelligence. AAAI 2021. 16, 14347\u201314355 (2021)","DOI":"10.1609\/aaai.v35i16.17687"},{"key":"50_CR7","doi-asserted-by":"crossref","unstructured":"Sun, L., Wang, J., Zhang, K., Su, Y., Weng, F.: RpBERT: a text-image relation propagation-based BERT model for multimodal NER. In: 35th AAAI Conference on Artificial Intelligence. AAAI 2021, vol. 15, pp. 13860\u201313868 (2021)","DOI":"10.1609\/aaai.v35i15.17633"},{"key":"50_CR8","doi-asserted-by":"publisher","first-page":"927","DOI":"10.1109\/TMM.2017.2760101","volume":"20","author":"Y Hu","year":"2018","unstructured":"Hu, Y., Zheng, L., Yang, Y., Huang, Y.: Twitter100k: a real-world dataset for weakly supervised cross-media retrieval. IEEE Trans. Multimed. 20, 927\u2013938 (2018)","journal-title":"IEEE Trans. Multimed."},{"key":"50_CR9","doi-asserted-by":"publisher","first-page":"1188","DOI":"10.1109\/TRO.2012.2197158","volume":"28","author":"D G\u00e1lvez-L\u00f3pez","year":"2012","unstructured":"G\u00e1lvez-L\u00f3pez, D., Tard\u00f3s, J.D.: Bags of binary words for fast place recognition in image sequences. IEEE Trans. Robot. 28, 1188\u20131197 (2012)","journal-title":"IEEE Trans. Robot."},{"key":"50_CR10","unstructured":"Mikolov, T., Chen, K., Corrado, G., Dean, J.: Efficient estimation of word representations in vector space. In: 1st International Conference on Learning Representations, ICLR 2013 - Workshop Track Proceedings, pp. 1\u201312 (2013)"},{"key":"50_CR11","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.: Glove: global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing, Doha, Qatar, pp. 1532\u20131543. Association for Computational Linguistics (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"50_CR12","doi-asserted-by":"crossref","unstructured":"Joulin, A., Grave, E., Bojanowski, P., Mikolov, T.: Bag of tricks for efficient text classification. In: 15th Conference of the European Chapter of the Association for Computational Linguistics, EACL 2017 - Proceedings of Conference, pp. 427\u2013431 (2017)","DOI":"10.18653\/v1\/E17-2068"},{"key":"50_CR13","doi-asserted-by":"crossref","unstructured":"Peters, M.E., et al.: Deep contextualized word representations. In: NAACL HLT 2018 - 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies - Proceedings of the Conference, pp. 2227\u20132237 (2018)","DOI":"10.18653\/v1\/N18-1202"},{"key":"50_CR14","unstructured":"McCann, B., Bradbury, J., Xiong, C., Socher, R.: Learned in translation: Contextualized word vectors. In: Advances in Neural Information Processing Systems, pp. 6295\u20136306 (2017)"},{"key":"50_CR15","unstructured":"Conneau, A., Lample, G.: Cross-lingual language model pretraining. In: Advances in Neural Information Processing Systems, pp. 1\u201311 (2019)"},{"key":"50_CR16","unstructured":"Radford, A., Narasimhan, K.: Improving Language Understanding by Generative Pre-Training. Presented at the (2018)"},{"key":"50_CR17","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. NAACL HLT 2019 - 2019 Conference of the North American Chapter of the Association for Computational Linguistics, Human Language Technologies \u2013 Proceedings of the Conference, vol. 1, pp. 4171\u20134186 (2019)"},{"key":"50_CR18","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized Bert pretraining approach. ArXiv.abs\/1907.1 (2019)"},{"key":"50_CR19","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., Soricut, R.: ALBERT: a lite BERT for self-supervised learning of language representations. ArXiv.abs\/1909.1 (2019)"},{"key":"50_CR20","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Bengio, Y., Haffner, P.: Gradient-based learning applied to document recognition. Proc. IEEE. 86, 2278\u20132323 (1998)","journal-title":"Proc. IEEE."},{"key":"50_CR21","doi-asserted-by":"crossref","unstructured":"Misra, I., van der Maaten, L.: Self-supervised learning of pretext-invariant representations. In: Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 6706\u20136716 (2020)","DOI":"10.1109\/CVPR42600.2020.00674"},{"key":"50_CR22","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. CoRR. abs\/1409.1 (2014)"},{"key":"50_CR23","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2818\u20132826 (2016)","DOI":"10.1109\/CVPR.2016.308"},{"key":"50_CR24","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"50_CR25","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R., Doll\u00e1r, P., Tu, Z., He, K.: Aggregated Residual Transformations for Deep Neural Networks. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5987\u20135995 (2017)","DOI":"10.1109\/CVPR.2017.634"},{"key":"50_CR26","doi-asserted-by":"crossref","unstructured":"Huang, G., Liu, Z., Van Der Maaten, L., Weinberger, K.Q.: Densely Connected Convolutional Networks. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2261\u20132269 (2017)","DOI":"10.1109\/CVPR.2017.243"},{"key":"50_CR27","unstructured":"Chen, Y., Li, J., Xiao, H., Jin, X., Yan, S., Feng, J.: Dual path networks. In: Advances in Neural Information Processing Systems. pp. 4468\u20134476 (2017)"},{"key":"50_CR28","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. ArXiv. abs\/2010.1 (2020)"},{"key":"50_CR29","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.-J., Chang, K.-W.: Visualbert: a simple and performant baseline for vision and language. arXiv Prepr. arXiv1908.03557. (2019)"},{"key":"50_CR30","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2017","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. IEEE Trans. Pattern Anal. Mach. Intell. 39, 1137\u20131149 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"50_CR31","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/s11633-022-1369-5","volume":"20","author":"F-L Chen","year":"2023","unstructured":"Chen, F.-L., et al.: Vlp: A survey on vision-language pre-training. Mach. Intell. Res. 20, 38\u201356 (2023)","journal-title":"Mach. Intell. Res."},{"key":"50_CR32","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., Jiang, D.: Unicoder-VL: A universal encoder for vision and language by cross-modal pre-training. In: AAAI 2020 - 34th AAAI Conference on Artificial Intelligence, pp. 11336\u201311344 (2020)","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"50_CR33","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems, pp. 1\u201311 (2019)"},{"key":"50_CR34","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), Hong Kong, China, pp. 5100\u20135111. Association for Computational Linguistics (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"50_CR35","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (2021)"},{"key":"50_CR36","unstructured":"Li, J., Selvaraju, R.R., Gotmare, A.D., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. In: Advances in Neural Information Processing Systems, pp. 9694\u20139705 (2021)"},{"key":"50_CR37","doi-asserted-by":"publisher","first-page":"12","DOI":"10.1016\/j.neucom.2021.01.060","volume":"439","author":"Y Tian","year":"2021","unstructured":"Tian, Y., Sun, X., Yu, H., Li, Y., Fu, K.: Hierarchical self-adaptation network for multimodal named entity recognition in social media. Neurocomputing 439, 12\u201321 (2021)","journal-title":"Neurocomputing"},{"key":"50_CR38","doi-asserted-by":"crossref","unstructured":"Sun, L., et al.: RIVA: a pre-trained tweet multimodal model based on text-image relation for multimodal NER. In: COLING 2020 - 28th International Conference on Computational Linguistics, Proceedings of the Conference, pp. 1852\u20131862 (2020)","DOI":"10.18653\/v1\/2020.coling-main.168"},{"key":"50_CR39","doi-asserted-by":"crossref","unstructured":"Wu, Z., Zheng, C., Cai, Y., Chen, J., Leung, H.F., Li, Q.: Multimodal representation with embedded visual guiding objects for named entity recognition in social media posts. In: MM 2020 \u2013 Proceedings of the 28th ACM International Conference on Multimedia, pp. 1038\u20131046 (2020)","DOI":"10.1145\/3394171.3413650"},{"key":"50_CR40","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2020.114101","volume":"167","author":"C Suman","year":"2021","unstructured":"Suman, C., Reddy, S.M., Saha, S., Bhattacharyya, P.: Why pay more? A simple and efficient named entity recognition system for tweets. Expert Syst. Appl. 167, 114101 (2021)","journal-title":"Expert Syst. Appl."},{"key":"50_CR41","doi-asserted-by":"publisher","first-page":"2520","DOI":"10.1109\/TMM.2020.3013398","volume":"23","author":"C Zheng","year":"2021","unstructured":"Zheng, C., Wu, Z., Wang, T., Cai, Y., Li, Q.: Object-aware multimodal named entity recognition in social media posts with adversarial learning. IEEE Trans. Multimed. 23, 2520\u20132532 (2021)","journal-title":"IEEE Trans. Multimed."},{"key":"50_CR42","doi-asserted-by":"crossref","unstructured":"Shahzad, M., Amin, A., Esteves, D., Ngomo, A.C.N.: InferNER: an attentive model leveraging the sentence-level information for Named Entity Recognition in Microblogs. Proceedings of the International Florida Artificial Intelligence Research Society Conference, FLAIRS, vol. 34 (2021)","DOI":"10.32473\/flairs.v34i1.128538"},{"key":"50_CR43","doi-asserted-by":"publisher","unstructured":"Wu, H., Cheng, S., Wang, J., Li, S., Chi, L.: Multimodal aspect extraction with region-aware alignment network. In: Zhu, X., Zhang, M., Hong, Y., He, R. (eds.) NLPCC 2020. LNCS (LNAI), vol. 12430, pp. 145\u2013156. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-60450-9_12","DOI":"10.1007\/978-3-030-60450-9_12"},{"key":"50_CR44","first-page":"4378","volume":"31","author":"G Collell","year":"2017","unstructured":"Collell, G., Zhang, T., Moens, M.: Imagined visual representations as multimodal embeddings. Proc. AAAI Conf. Artif. Intell. 31, 4378\u20134384 (2017)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"50_CR45","doi-asserted-by":"crossref","unstructured":"Yu, J., Jiang, J., Yang, L., Xia, R.: Improving multimodal named entity recognition via entity span detection with unified multimodal transformer. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics, pp. 3342\u20133352 (2020)","DOI":"10.18653\/v1\/2020.acl-main.306"},{"key":"50_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"186","DOI":"10.1007\/978-3-030-73197-7_12","volume-title":"Database Systems for Advanced Applications","author":"D Chen","year":"2021","unstructured":"Chen, D., Li, Z., Gu, B., Chen, Z.: Multimodal named entity recognition with image attributes and image knowledge. In: Jensen, C.S., et al. (eds.) DASFAA 2021. LNCS, vol. 12682, pp. 186\u2013201. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-73197-7_12"},{"key":"50_CR47","unstructured":"Lu, J., Zhang, D., Zhang, J., Zhang, P.: Flat Multi-modal interaction transformer for named entity recognition. In: Proceedings of the 29th International Conference on Computational Linguistics, Gyeongju, Republic of Korea, pp. 2055\u20132064. International Committee on Computational Linguistics (2022)"},{"key":"50_CR48","doi-asserted-by":"crossref","unstructured":"Zhao, F., Li, C., Wu, Z., Xing, S., Dai, X.: Learning from Different text-image Pairs: A Relation-enhanced Graph Convolutional Network for Multimodal NER. Association for Computing Machinery (2022)","DOI":"10.1145\/3503161.3548228"},{"key":"50_CR49","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: Good visual guidance makes a better extractor: hierarchical visual prefix for multimodal entity and relation extraction. In: Findings of the Association for Computational Linguistics. NAACL 2022 - Find., pp. 1607\u20131618 (2022)","DOI":"10.18653\/v1\/2022.findings-naacl.121"},{"key":"50_CR50","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: CAT-MNER: multimodal named entity recognition with knowledge-refined cross-modal attention. In: 2022 IEEE International Conference on Multimedia and Expo (ICME), pp. 1\u20136 (2022)","DOI":"10.1109\/ICME52920.2022.9859972"},{"key":"50_CR51","unstructured":"Jia, M., et al.: MNER-QG: an end-to-end MRC framework for Multimodal named entity recognition with query grounding. arXiv Prepr. arXiv2211.14739 (2022)"},{"key":"50_CR52","doi-asserted-by":"crossref","unstructured":"Cadene, R., Ben-younes, H., Cord, M., Thome, N.: MUREL: multimodal relational reasoning for visual question answering Sorbonne Universit. In: Conservatoire National des Arts et M. Cvpr 2019, pp. 1989--1998 (2019)","DOI":"10.1109\/CVPR.2019.00209"},{"key":"50_CR53","doi-asserted-by":"crossref","unstructured":"Zhang, X., Yuan, J., Li, L., Liu, J.: Reducing the Bias of Visual Objects in Multimodal Named Entity Recognition. Association for Computing Machinery (2023)","DOI":"10.1145\/3539597.3570485"},{"key":"50_CR54","doi-asserted-by":"crossref","unstructured":"Arshad, O., Gallo, I., Nawaz, S., Calefati, A.: Aiding intra-text representations with visual context for multimodal named entity recognition. In: Proceedings International Conference Document Analysis, Recognition, ICDAR, pp. 337\u2013342 (2019)","DOI":"10.1109\/ICDAR.2019.00061"}],"container-title":["Lecture Notes in Computer Science","Advanced Intelligent Computing Technology and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-99-4752-2_50","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,1]],"date-time":"2023-08-01T23:14:35Z","timestamp":1690931675000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-99-4752-2_50"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9789819947515","9789819947522"],"references-count":54,"URL":"https:\/\/doi.org\/10.1007\/978-981-99-4752-2_50","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"31 July 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Intelligent Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Zhengzhou","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 August 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 August 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icic2023a","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.ic-icc.cn\/2023\/index.htm","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}