{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T19:33:57Z","timestamp":1777577637493,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"The Key Program of the National Natural Science Foundation of China","award":["62237001"],"award-info":[{"award-number":["62237001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658097","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"469-477","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["VEC-MNER: Hybrid Transformer with Visual-Enhanced Cross-Modal Multi-level Interaction for Multimodal NER"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4479-8248","authenticated-orcid":false,"given":"Pengfei","family":"Wei","sequence":"first","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2815-1524","authenticated-orcid":false,"given":"Hongjun","family":"Ouyang","sequence":"additional","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3199-2631","authenticated-orcid":false,"given":"Qintai","family":"Hu","sequence":"additional","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8596-8333","authenticated-orcid":false,"given":"Bi","family":"Zeng","sequence":"additional","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4450-597X","authenticated-orcid":false,"given":"Guang","family":"Feng","sequence":"additional","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2182-8949","authenticated-orcid":false,"given":"Qingpeng","family":"Wen","sequence":"additional","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01505"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00061"},{"key":"e_1_3_2_1_3_1","volume-title":"CWI: A multimodal deep learning approach for named entity recognition from social media using character, word and image features. Neural Computing and Applications","author":"Asgari-Chenaghlu Meysam","year":"2022","unstructured":"Meysam Asgari-Chenaghlu, M Reza Feizi-Derakhshi, Leili Farzinvash, MA Balafar, and Cina Motamed. 2022. CWI: A multimodal deep learning approach for named entity recognition from social media using character, word and image features. Neural Computing and Applications (2022), 1--18."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-73197-7_12"},{"key":"e_1_3_2_1_5_1","volume-title":"International conference on machine learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597--1607."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.121"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_11_1","volume-title":"Bidirectional LSTM-CRF models for sequence tagging. arXiv preprint arXiv:1508.01991","author":"Huang Zhiheng","year":"2015","unstructured":"Zhiheng Huang, Wei Xu, and Kai Yu. 2015. Bidirectional LSTM-CRF models for sequence tagging. arXiv preprint arXiv:1508.01991 (2015)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548427"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1030"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-short.135"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1185"},{"key":"e_1_3_2_1_17_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 29th International Conference on Computational Linguistics. 2055--2064","author":"Lu Junyu","year":"2022","unstructured":"Junyu Lu, Dixiang Zhang, Jiaxing Zhang, and Pingjian Zhang. 2022. Flat Multimodal Interaction Transformer for Named Entity Recognition. In Proceedings of the 29th International Conference on Computational Linguistics. 2055--2064."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1101"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1186"},{"key":"e_1_3_2_1_21_1","volume-title":"Multimodal named entity recognition for short social media posts. arXiv preprint arXiv:1802.07862","author":"Moon Seungwhan","year":"2018","unstructured":"Seungwhan Moon, Leonardo Neves, and Vitor Carvalho. 2018. Multimodal named entity recognition for short social media posts. arXiv preprint arXiv:1802.07862 (2018)."},{"key":"e_1_3_2_1_22_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01019"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). 2807--2818","author":"Sui Dianbo","year":"2021","unstructured":"Dianbo Sui, Zhengkun Tian, Yubo Chen, Kang Liu, and Jun Zhao. 2021. A largescale chinese multimodal ner dataset with speech clues. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). 2807--2818."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-main.168"},{"key":"e_1_3_2_1_26_1","first-page":"33536","article-title":"Multi-granularity cross-modal alignment for generalized medical visual representation learning","volume":"35","author":"Wang Fuying","year":"2022","unstructured":"Fuying Wang, Yuyin Zhou, Shujun Wang, Varut Vardhanabhuti, and Lequan Yu. 2022. Multi-granularity cross-modal alignment for generalized medical visual representation learning. Advances in Neural Information Processing Systems 35 (2022), 33536--33549.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2022EDP7116"},{"key":"e_1_3_2_1_28_1","volume-title":"Ita: image-text alignments for multi-modal named entity recognition. arXiv preprint arXiv:2112.06482","author":"Wang Xinyu","year":"2021","unstructured":"Xinyu Wang, Min Gui, Yong Jiang, Zixia Jia, Nguyen Bach, Tao Wang, Zhongqiang Huang, Fei Huang, and Kewei Tu. 2021. Ita: image-text alignments for multi-modal named entity recognition. arXiv preprint arXiv:2112.06482 (2021)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-00129-1_24"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.121"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413650"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3488560.3498475"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21409"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00478"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.273"},{"key":"e_1_3_2_1_36_1","volume-title":"Improving multimodal named entity recognition via entity span detection with unified multimodal transformer","author":"Yu Jianfei","unstructured":"Jianfei Yu, Jing Jiang, Li Yang, and Rui Xia. 2020. Improving multimodal named entity recognition via entity span detection with unified multimodal transformer. Association for Computational Linguistics."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17687"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11962"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3570485"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3013398"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658097","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658097","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:48:07Z","timestamp":1755766087000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658097"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":40,"alternative-id":["10.1145\/3652583.3658097","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658097","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}