{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T18:55:18Z","timestamp":1771268118001,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658011","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"795-803","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Contrastive Pre-training with Multi-level Alignment for Grounded Multimodal Named Entity Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-3250-2403","authenticated-orcid":false,"given":"Xigang","family":"Bao","sequence":"first","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2637-2628","authenticated-orcid":false,"given":"Mengyuan","family":"Tian","sequence":"additional","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0565-3838","authenticated-orcid":false,"given":"Luyao","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8702-4088","authenticated-orcid":false,"given":"Zhiyuan","family":"Zha","sequence":"additional","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4304-675X","authenticated-orcid":false,"given":"Biao","family":"Qin","sequence":"additional","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3614975"},{"key":"e_1_3_2_1_3_1","volume-title":"Can images help recognize entities? A study of the role of images for Multimodal NER. arXiv preprint arXiv:2010.12712","author":"Chen Shuguang","year":"2020","unstructured":"Shuguang Chen, Gustavo Aguilar, Leonardo Neves, and Thamar Solorio. 2020. Can images help recognize entities? A study of the role of images for Multimodal NER. arXiv preprint arXiv:2010.12712 (2020)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531992"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3345146"},{"key":"e_1_3_2_1_6_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_7_1","unstructured":"Li Fei-Fei Hao Su Minh Do Kai Li and Jia Deng. 2009. Construction and Analysis of a Large Scale Image Ontology. (2009)."},{"key":"e_1_3_2_1_8_1","volume-title":"Stochastic neighbor embedding. Advances in neural information processing systems","author":"Hinton Geoffrey E","year":"2002","unstructured":"Geoffrey E Hinton and Sam Roweis. 2002. Stochastic neighbor embedding. Advances in neural information processing systems , Vol. 15 (2002)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611899"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i7.25971"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548427"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of naacL-HLT","volume":"1","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of naacL-HLT, Vol. 1. 2."},{"key":"e_1_3_2_1_13_1","volume-title":"Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461","author":"Lewis Mike","year":"2019","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer. 2019. Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461 (2019)."},{"key":"e_1_3_2_1_14_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1185"},{"key":"e_1_3_2_1_16_1","volume-title":"Flat multi-modal interaction transformer for named entity recognition. arXiv preprint arXiv:2208.11039","author":"Lu Junyu","year":"2022","unstructured":"Junyu Lu, Dixiang Zhang, and Pingjian Zhang. 2022. Flat multi-modal interaction transformer for named entity recognition. arXiv preprint arXiv:2208.11039 (2022)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_1_18_1","volume-title":"Multimodal named entity recognition for short social media posts. arXiv preprint arXiv:1802.07862","author":"Moon Seungwhan","year":"2018","unstructured":"Seungwhan Moon, Leonardo Neves, and Vitor Carvalho. 2018. Multimodal named entity recognition for short social media posts. arXiv preprint arXiv:1802.07862 (2018)."},{"key":"e_1_3_2_1_19_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_21_1","first-page":"25278","article-title":"Laion-5b: An open large-scale dataset for training next generation image-text models","volume":"35","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, et al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems , Vol. 35 (2022), 25278--25294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i15.17633"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the conference. Association for Computational Linguistics. Meeting","volume":"2019","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai, Shaojie Bai, Paul Pu Liang, J Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Multimodal transformer for unaligned multimodal language sequences. In Proceedings of the conference. Association for Computational Linguistics. Meeting, Vol. 2019. NIH Public Access, 6558."},{"key":"e_1_3_2_1_24_1","volume-title":"Ita: image-text alignments for multi-modal named entity recognition. arXiv preprint arXiv:2112.06482","author":"Wang Xinyu","year":"2021","unstructured":"Xinyu Wang, Min Gui, Yong Jiang, Zixia Jia, Nguyen Bach, Tao Wang, Zhongqiang Huang, Fei Huang, and Kewei Tu. 2021. Ita: image-text alignments for multi-modal named entity recognition. arXiv preprint arXiv:2112.06482 (2021)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859972"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413650"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the 29th International Conference on Computational Linguistics. 1855--1864","author":"Xu Bo","year":"2022","unstructured":"Bo Xu, Shizhou Huang, Ming Du, Hongya Wang, Hui Song, Chaofeng Sha, and Yanghua Xiao. 2022a. Different data, different modalities! reinforced data splitting for effective multimodal information extraction from social media posts. In Proceedings of the 29th International Conference on Computational Linguistics. 1855--1864."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3488560.3498475"},{"key":"e_1_3_2_1_29_1","volume-title":"A unified generative framework for various NER subtasks. arXiv preprint arXiv:2106.01223","author":"Yan Hang","year":"2021","unstructured":"Hang Yan, Tao Gui, Junqi Dai, Qipeng Guo, Zheng Zhang, and Xipeng Qiu. 2021. A unified generative framework for various NER subtasks. arXiv preprint arXiv:2106.01223 (2021)."},{"key":"e_1_3_2_1_30_1","volume-title":"Improving multimodal named entity recognition via entity span detection with unified multimodal transformer","author":"Yu Jianfei","unstructured":"Jianfei Yu, Jing Jiang, Li Yang, and Rui Xia. 2020. Improving multimodal named entity recognition via entity span detection with unified multimodal transformer. Association for Computational Linguistics."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.508"},{"key":"e_1_3_2_1_32_1","volume-title":"Rethinking diversified and discriminative proposal generation for visual grounding. arXiv preprint arXiv:1805.03508","author":"Yu Zhou","year":"2018","unstructured":"Zhou Yu, Jun Yu, Chenchao Xiang, Zhou Zhao, Qi Tian, and Dacheng Tao. 2018a. Rethinking diversified and discriminative proposal generation for visual grounding. arXiv preprint arXiv:1805.03508 (2018)."},{"key":"e_1_3_2_1_33_1","volume-title":"Rethinking diversified and discriminative proposal generation for visual grounding. arXiv preprint arXiv:1805.03508","author":"Yu Zhou","year":"2018","unstructured":"Zhou Yu, Jun Yu, Chenchao Xiang, Zhou Zhao, Qi Tian, and Dacheng Tao. 2018b. Rethinking diversified and discriminative proposal generation for visual grounding. arXiv preprint arXiv:1805.03508 (2018)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17687"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00089"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11962"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3570485"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.376"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3476968"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658011","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658011","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:47:06Z","timestamp":1755766026000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658011"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":40,"alternative-id":["10.1145\/3652583.3658011","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658011","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}