{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T02:18:42Z","timestamp":1771467522913,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612095","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"4555-4563","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["Learning Implicit Entity-object Relations by Bidirectional Generative Alignment for Multimodal NER"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1800-8441","authenticated-orcid":false,"given":"Feng","family":"Chen","sequence":"first","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3020-2677","authenticated-orcid":false,"given":"Jiajia","family":"Liu","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4669-8622","authenticated-orcid":false,"given":"Kaixiang","family":"Ji","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6076-4895","authenticated-orcid":false,"given":"Wang","family":"Ren","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4144-1753","authenticated-orcid":false,"given":"Jian","family":"Wang","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1872-2592","authenticated-orcid":false,"given":"Jingdong","family":"Chen","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"LaT: Latent Translation with Cycle-Consistency for Video-Text Retrieval. arXiv preprint arXiv:2207.04858","author":"Bai Jinbin","year":"2022","unstructured":"Jinbin Bai, Chunhui Liu, Feiyue Ni, Haofan Wang, Mengying Hu, Xiaofeng Guo, and Lele Cheng. 2022. LaT: Latent Translation with Cycle-Consistency for Video-Text Retrieval. arXiv preprint arXiv:2207.04858 (2022)."},{"key":"e_1_3_2_1_2_1","volume-title":"Learning Distinct and Representative Modes for Image Captioning. ArXiv","author":"Chen Qi","year":"2022","unstructured":"Qi Chen, Chaorui Deng, and Qi Wu. 2022a. Learning Distinct and Representative Modes for Image Captioning. ArXiv, Vol. abs\/2209.08231 (2022)."},{"key":"e_1_3_2_1_3_1","volume-title":"Hybrid Transformer with Multi-level Fusion for Multimodal Knowledge Graph Completion. arXiv preprint arXiv:2205.02357","author":"Chen Xiang","year":"2022","unstructured":"Xiang Chen, Ningyu Zhang, Lei Li, Shumin Deng, Chuanqi Tan, Changliang Xu, Fei Huang, Luo Si, and Huajun Chen. 2022b. Hybrid Transformer with Multi-level Fusion for Multimodal Knowledge Graph Completion. 
arXiv preprint arXiv:2205.02357 (2022)."},{"key":"e_1_3_2_1_4_1","volume-title":"Good Visual Guidance Makes A Better Extractor: Hierarchical Visual Prefix for Multimodal Entity and Relation Extraction. arXiv preprint arXiv:2205.03521","author":"Chen Xiang","year":"2022","unstructured":"Xiang Chen, Ningyu Zhang, Lei Li, Yunzhi Yao, Shumin Deng, Chuanqi Tan, Fei Huang, Luo Si, and Huajun Chen. 2022c. Good Visual Guidance Makes A Better Extractor: Hierarchical Visual Prefix for Multimodal Entity and Relation Extraction. arXiv preprint arXiv:2205.03521 (2022)."},{"key":"e_1_3_2_1_5_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. ArXiv","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. ArXiv, Vol. abs\/1810.04805 (2019)."},{"key":"e_1_3_2_1_6_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ArXiv","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ArXiv, Vol. abs\/2010.11929 (2020)."},{"key":"e_1_3_2_1_7_1","unstructured":"Satya Krishna Gorti No\u00ebl Vouitsis Junwei Ma Keyvan Golestan Maksims Volkovs Animesh Garg and Guangwei Yu. 2022. X-Pool: Cross-Modal Language-Video Attention for Text-Video Retrieval. In CVPR. 5006--5015."},{"key":"e_1_3_2_1_8_1","volume-title":"Prompt-to-Prompt Image Editing with Cross Attention Control. ArXiv","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay M. Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022. Prompt-to-Prompt Image Editing with Cross Attention Control. ArXiv, Vol. abs\/2208.01626 (2022)."},{"key":"e_1_3_2_1_9_1","volume-title":"Imagen Video: High Definition Video Generation with Diffusion Models. ArXiv","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, William Chan, Chitwan Saharia, Jay Whang, Ruiqi Gao, Alexey A. Gritsenko, Diederik P. Kingma, Ben Poole, Mohammad Norouzi, David J. Fleet, and Tim Salimans. 2022. Imagen Video: High Definition Video Generation with Diffusion Models. ArXiv, Vol. abs\/2210.02303 (2022)."},{"key":"e_1_3_2_1_10_1","unstructured":"Xiaowei Hu Zhe Gan Jianfeng Wang Zhengyuan Yang Zicheng Liu Yumao Lu and Lijuan Wang. 2022. Scaling up Vision-language Pre-training for Image Captioning. In CVPR. 17980--17989."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Meihuizi Jia Xin Shen Lei Shen Jinhui Pang Lejian Liao Yang Song Meng Chen and Xiaodong He. 2022. Query Prior Matters: A MRC Framework for Multimodal Named Entity Recognition. In MM. 3549--3558.","DOI":"10.1145\/3503161.3548427"},{"key":"e_1_3_2_1_12_1","volume-title":"TransGAN: Two Transformers Can Make One Strong GAN. ArXiv","author":"Jiang Yifan","year":"2021","unstructured":"Yifan Jiang, Shiyu Chang, and Zhangyang Wang. 2021. TransGAN: Two Transformers Can Make One Strong GAN. ArXiv, Vol. 
abs\/2102.07074 (2021)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1214\/aoms\/1177729694"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Guillaume Lample Miguel Ballesteros Sandeep Subramanian Kazuya Kawakami and Chris Dyer. 2016. Neural Architectures for Named Entity Recognition. In NAACL.","DOI":"10.18653\/v1\/N16-1030"},{"key":"e_1_3_2_1_15_1","volume-title":"FLAT: Chinese NER using Flat-lattice Transformer. arXiv preprint arXiv:2004.11795","author":"Li Xiaonan","year":"2020","unstructured":"Xiaonan Li, Hang Yan, Xipeng Qiu, and Xuanjing Huang. 2020. FLAT: Chinese NER using Flat-lattice Transformer. arXiv preprint arXiv:2004.11795 (2020)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-021-02546-5"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Di Lu Leonardo Neves Vitor Carvalho Ning Zhang and Heng Ji. 2018. Visual Attention Model for Name Tagging in Multimodal Social Media. In ACL. 1990--1999.","DOI":"10.18653\/v1\/P18-1185"},{"key":"e_1_3_2_1_18_1","volume-title":"Flat Multi-modal Interaction Transformer for Named Entity Recognition. arXiv preprint arXiv:2208.11039","author":"Lu Junyu","year":"2022","unstructured":"Junyu Lu, Dixiang Zhang, and Pingjian Zhang. 2022. Flat Multi-modal Interaction Transformer for Named Entity Recognition. arXiv preprint arXiv:2208.11039 (2022)."},{"key":"e_1_3_2_1_19_1","volume-title":"End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF. ArXiv","author":"Ma Xuezhe","year":"2016","unstructured":"Xuezhe Ma and Eduard H. Hovy. 2016. End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF. ArXiv, Vol. abs\/1603.01354 (2016)."},{"key":"e_1_3_2_1_20_1","volume-title":"Learning Transferable Visual Models from Natural Language Supervision. In ICML. 8748--8763.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models from Natural Language Supervision. In ICML. 8748--8763."},{"key":"e_1_3_2_1_21_1","first-page":"13937","article-title":"Dynamicvit: Efficient Vision Transformers with Dynamic Token Sparsification","volume":"34","author":"Rao Yongming","year":"2021","unstructured":"Yongming Rao, Wenliang Zhao, Benlin Liu, Jiwen Lu, Jie Zhou, and Cho-Jui Hsieh. 2021. Dynamicvit: Efficient Vision Transformers with Dynamic Token Sparsification. NeurIPS, Vol. 34 (2021), 13937--13949.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_22_1","first-page":"1137","article-title":"Faster R-CNN: Towards Real-time Object Detection with Region Proposal Networks","volume":"39","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross B. Girshick, and Jian Sun. 2015. Faster R-CNN: Towards Real-time Object Detection with Region Proposal Networks. TPAMI, Vol. 39 (2015), 1137--1149.","journal-title":"TPAMI"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Erik Tjong Kim Sang and Jorn Veenstra. 1999. Representing Text Chunks. In EACL. 173--179.","DOI":"10.3115\/977035.977059"},{"key":"e_1_3_2_1_24_1","volume-title":"RIVA: A Pre-trained Tweet Multimodal Model Based on Text-image Relation for Multimodal NER. In COLING. 1852--1862.","author":"Sun Lin","year":"2020","unstructured":"Lin Sun, Jiquan Wang, Yindu Su, Fangsheng Weng, Yuxuan Sun, Zengwei Zheng, and Yuanyi Chen. 2020. 
RIVA: A Pre-trained Tweet Multimodal Model Based on Text-image Relation for Multimodal NER. In COLING. 1852--1862."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i15.17633"},{"key":"e_1_3_2_1_26_1","volume-title":"ITA: Image-text Alignments for Multi-Modal Named Entity Recognition. arXiv preprint arXiv:2112.06482","author":"Wang Xinyu","year":"2021","unstructured":"Xinyu Wang, Min Gui, Yong Jiang, Zixia Jia, Nguyen Bach, Tao Wang, Zhongqiang Huang, Fei Huang, and Kewei Tu. 2021. ITA: Image-text Alignments for Multi-Modal Named Entity Recognition. arXiv preprint arXiv:2112.06482 (2021)."},{"key":"e_1_3_2_1_27_1","volume-title":"Prompt-Based Entity-Related Visual Clue Extraction and Integration for Multimodal Named Entity Recognition. In International Conference on Database Systems for Advanced Applications. 297--305","author":"Wang Xuwu","year":"2022","unstructured":"Xuwu Wang, Junfeng Tian, Min Gui, Zhixu Li, Jiabo Ye, Ming Yan, and Yanghua Xiao. 2022a. Prompt-Based Entity-Related Visual Clue Extraction and Integration for Multimodal Named Entity Recognition. In International Conference on Database Systems for Advanced Applications. 297--305."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Xuwu Wang Jiabo Ye Zhixu Li Junfeng Tian Yong Jiang Ming Yan Ji Zhang and Yanghua Xiao. 2022b. CAT-MNER: Multimodal Named Entity Recognition with Knowledge-Refined Cross-Modal Attention. In ICME. 1--6.","DOI":"10.1109\/ICME52920.2022.9859972"},{"key":"e_1_3_2_1_29_1","volume-title":"MAF: A General Matching and Alignment Framework for Multimodal Named Entity Recognition. In WSDM. 1215--1223.","author":"Xu Bo","year":"2022","unstructured":"Bo Xu, Shizhou Huang, Chaofeng Sha, and Hongya Wang. 2022. MAF: A General Matching and Alignment Framework for Multimodal Named Entity Recognition. In WSDM. 1215--1223."},{"key":"e_1_3_2_1_30_1","unstructured":"Jianfei Yu Jing Jiang Li Yang and Rui Xia. 2020. Improving Multimodal Named Entity Recognition via Entity Span Detection with Unified Multimodal Transformer. In ACL. 3342--3352."},{"key":"e_1_3_2_1_31_1","volume-title":"Mattnet: Modular Attention Network for Referring Expression Comprehension. In CVPR. 1307--1315.","author":"Yu Licheng","year":"2018","unstructured":"Licheng Yu, Zhe Lin, Xiaohui Shen, Jimei Yang, Xin Lu, Mohit Bansal, and Tamara L Berg. 2018. Mattnet: Modular Attention Network for Referring Expression Comprehension. In CVPR. 1307--1315."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17687"},{"key":"e_1_3_2_1_33_1","volume-title":"Aligning instruction tasks unlocks large language models as zero-shot relation extractors. arXiv preprint arXiv:2305.11159","author":"Zhang Kai","year":"2023","unstructured":"Kai Zhang, Bernal Jim\u00e9nez Guti\u00e9rrez, and Yu Su. 2023. Aligning instruction tasks unlocks large language models as zero-shot relation extractors. arXiv preprint arXiv:2305.11159 (2023)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11962"},{"key":"e_1_3_2_1_35_1","volume-title":"BERTScore: Evaluating Text Generation with BERT. ArXiv","author":"Zhang Tianyi","year":"2019","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q. Weinberger, and Yoav Artzi. 2019. BERTScore: Evaluating Text Generation with BERT. ArXiv, Vol. abs\/1904.09675 (2019)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Fei Zhao Chunhui Li Zhen Wu Shangyu Xing and Xinyu Dai. 2022. Learning from Different Text-image Pairs: A Relation-enhanced Graph Convolutional Network for Multimodal NER. 
In MM. 3983--3992.","DOI":"10.1145\/3503161.3548228"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Shan Zhao Minghao Hu Zhiping Cai and Fang Liu. 2021. Modeling Dense Cross-modal Interactions for Joint Entity-relation Extraction. In IJCAI. 4032--4038.","DOI":"10.24963\/ijcai.2020\/558"},{"key":"e_1_3_2_1_38_1","volume-title":"Unpaired Image-to-Image Translation Using Cycle-Consistent Adversarial Networks. ICCV","author":"Zhu Jun-Yan","year":"2017","unstructured":"Jun-Yan Zhu, Taesung Park, Phillip Isola, and Alexei A. Efros. 2017. Unpaired Image-to-Image Translation Using Cycle-Consistent Adversarial Networks. ICCV (2017), 2242--2251."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612095","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612095","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:05:33Z","timestamp":1755821133000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612095"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":38,"alternative-id":["10.1145\/3581783.3612095","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612095","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
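
The record above is a Crossref work object ("message-type":"work", "message-version":"1.0.0") for DOI 10.1145/3581783.3612095. As a minimal sketch of how such a record can be retrieved and read, the Python snippet below fetches the same DOI from the public Crossref REST API (api.crossref.org) using the requests library. The endpoint and field names match the JSON above; the mailto contact address is a placeholder for Crossref's "polite pool" etiquette, not something taken from the record.

import requests

# Minimal sketch: fetch the Crossref work record shown above and read a few
# of its fields. Assumes the public Crossref REST API and the requests
# library; the mailto address below is a placeholder, not part of the record.
DOI = "10.1145/3581783.3612095"

resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.com"},  # placeholder contact address
    timeout=10,
)
resp.raise_for_status()
work = resp.json()["message"]  # the "message" object printed above

# Title, authors ("given"/"family" parts), and page range, as in the record.
print(work["title"][0])
print(", ".join(f"{a['given']} {a['family']}" for a in work["author"]))
print("pages:", work["page"])

# The reference list: each entry carries a DOI, an unstructured citation
# string, or both, so fall back through those fields when printing.
print(work["references-count"], "references")
for ref in work.get("reference", []):
    print(" -", ref.get("DOI") or ref.get("unstructured", ref["key"])[:80])

Run against the live API, this should print the paper's title, its six Ant Group authors, the page range 4555-4563, and one line per reference (38 in this deposit), modulo any metadata updates deposited after the snapshot above.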