{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T08:30:28Z","timestamp":1781598628855,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Natural Science Foundation of China","award":["62272232"],"award-info":[{"award-number":["62272232"]}]},{"name":"Natural Science Foundation of China","award":["62006117"],"award-info":[{"award-number":["62006117"]}]},{"name":"Natural Science Foundation of China","award":["62076133"],"award-info":[{"award-number":["62076133"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681598","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"7336-7345","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Generative Multimodal Data Augmentation for Low-Resource Multimodal Named Entity Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-3034-7998","authenticated-orcid":false,"given":"Ziyan","family":"Li","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8380-0609","authenticated-orcid":false,"given":"Jianfei","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2903-9589","authenticated-orcid":false,"given":"Jia","family":"Yang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5612-7818","authenticated-orcid":false,"given":"Wenya","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Computing and Data Science, Nanyang Technological University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7160-9620","authenticated-orcid":false,"given":"Li","family":"Yang","sequence":"additional","affiliation":[{"name":"Wee Kim Wee School of Communication and Information, Nanyang Technological University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0621-1058","authenticated-orcid":false,"given":"Rui","family":"Xia","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"CWI: A multimodal deep learning approach for named entity recognition from social media using character, word and image features. Neural Computing and Applications","author":"Asgari-Chenaghlu Meysam","year":"2022","unstructured":"Meysam Asgari-Chenaghlu, M Reza Feizi-Derakhshi, Leili Farzinvash, MA Balafar, and Cina Motamed. 2022. CWI: A multimodal deep learning approach for named entity recognition from social media using character, word and image features. Neural Computing and Applications (2022), 1--18."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3614975"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.196"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-73197-7_12"},{"key":"e_1_3_2_1_5_1","volume-title":"Chain-of-thought prompt distillation for multimodal named entity and multimodal relation extraction. arXiv preprint arXiv:2306.14122","author":"Chen Feng","year":"2023","unstructured":"Feng Chen and Yujian Feng. 2023. Chain-of-thought prompt distillation for multimodal named entity and multimodal relation extraction. arXiv preprint arXiv:2306.14122 (2023)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612095"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531992"},{"key":"e_1_3_2_1_8_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Cho Jaemin","year":"2021","unstructured":"Jaemin Cho, Jie Lei, Hao Tan, and Mohit Bansal. 2021. Unifying vision-and-language tasks via text generation. In International Conference on Machine Learning. PMLR, 1931--1942."},{"key":"e_1_3_2_1_9_1","unstructured":"Hyung Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Yunxuan Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma et al. 2022. Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)."},{"key":"e_1_3_2_1_10_1","volume-title":"InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv","author":"Dai W","year":"2023","unstructured":"W Dai, J Li, D Li, AMH Tiong, J Zhao, W Wang, B Li, P Fung, and S Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv 2023. arXiv preprint arXiv:2305.06500 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"An analysis of simple data augmentation for named entity recognition. arXiv preprint arXiv:2010.11683","author":"Dai Xiang","year":"2020","unstructured":"Xiang Dai and Heike Adel. 2020. An analysis of simple data augmentation for named entity recognition. arXiv preprint arXiv:2010.11683 (2020)."},{"key":"e_1_3_2_1_12_1","volume-title":"Shafiq Joty, Luo Si, and Chunyan Miao.","author":"Ding Bosheng","year":"2020","unstructured":"Bosheng Ding, Linlin Liu, Lidong Bing, Canasai Kruengkrai, Thien Hai Nguyen, Shafiq Joty, Luo Si, and Chunyan Miao. 2020. DAGA: Data augmentation with a generation approach for low-resource tagging tasks. arXiv preprint arXiv:2011.01549 (2020)."},{"key":"e_1_3_2_1_13_1","volume-title":"Hierarchical neural story generation. arXiv preprint arXiv:1805.04833","author":"Fan Angela","year":"2018","unstructured":"Angela Fan, Mike Lewis, and Yann Dauphin. 2018. Hierarchical neural story generation. arXiv preprint arXiv:1805.04833 (2018)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00294"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW58289.2023.00042"},{"key":"e_1_3_2_1_17_1","volume-title":"The curious case of neural text degeneration. arXiv preprint arXiv:1904.09751","author":"Holtzman Ari","year":"2019","unstructured":"Ari Holtzman, Jan Buys, Li Du, Maxwell Forbes, and Yejin Choi. 2019. The curious case of neural text degeneration. arXiv preprint arXiv:1904.09751 (2019)."},{"key":"e_1_3_2_1_18_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548427"},{"key":"e_1_3_2_1_20_1","volume-title":"A diversity-promoting objective function for neural conversation models. arXiv preprint arXiv:1510.03055","author":"Li Jiwei","year":"2015","unstructured":"Jiwei Li, Michel Galley, Chris Brockett, Jianfeng Gao, and Bill Dolan. 2015. A diversity-promoting objective function for neural conversation models. arXiv preprint arXiv:1510.03055 (2015)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.184"},{"key":"e_1_3_2_1_22_1","volume-title":"LLMs as Bridges: Reformulating Grounded Multimodal Named Entity Recognition. arXiv preprint arXiv:2402.09989","author":"Li Jinyuan","year":"2024","unstructured":"Jinyuan Li, Han Li, Di Sun, Jiahao Wang, Wenkun Zhang, Zan Wang, and Gang Pan. 2024. LLMs as Bridges: Reformulating Grounded Multimodal Named Entity Recognition. arXiv preprint arXiv:2402.09989 (2024)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1185"},{"key":"e_1_3_2_1_24_1","volume-title":"Syntactic data augmentation increases robustness to inference heuristics. arXiv preprint arXiv:2004.11999","author":"Min Junghyun","year":"2020","unstructured":"Junghyun Min, R Thomas McCoy, Dipanjan Das, Emily Pitler, and Tal Linzen. 2020. Syntactic data augmentation increases robustness to inference heuristics. arXiv preprint arXiv:2004.11999 (2020)."},{"key":"e_1_3_2_1_25_1","volume-title":"Multimodal named entity recognition for short social media posts. arXiv preprint arXiv:1802.07862","author":"Moon Seungwhan","year":"2018","unstructured":"Seungwhan Moon, Leonardo Neves, and Vitor Carvalho. 2018. Multimodal named entity recognition for short social media posts. arXiv preprint arXiv:1802.07862 (2018)."},{"key":"e_1_3_2_1_26_1","volume-title":"SCANNER: Knowledge-Enhanced Approach for Robust Multi-modal Named Entity Recognition of Unseen Entities. arXiv preprint arXiv:2404.01914","author":"Ok Hyunjong","year":"2024","unstructured":"Hyunjong Ok, Taeho Kil, Sukmin Seo, and Jaeho Lee. 2024. SCANNER: Knowledge-Enhanced Approach for Robust Multi-modal Named Entity Recognition of Unseen Entities. arXiv preprint arXiv:2404.01914 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_29_1","volume-title":"Data augmentation via dependency tree morphing for low-resource languages. arXiv preprint arXiv:1903.09460","author":"Sahin G\u00f6zde G\u00fcl","year":"2019","unstructured":"G\u00f6zde G\u00fcl cSahin and Mark Steedman. 2019. Data augmentation via dependency tree morphing for low-resource languages. arXiv preprint arXiv:1903.09460 (2019)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612322"},{"key":"e_1_3_2_1_31_1","volume-title":"Named entity and relation extraction with multi-modal retrieval. arXiv preprint arXiv:2212.01612","author":"Wang Xinyu","year":"2022","unstructured":"Xinyu Wang, Jiong Cai, Yong Jiang, Pengjun Xie, Kewei Tu, and Wei Lu. 2022. Named entity and relation extraction with multi-modal retrieval. arXiv preprint arXiv:2212.01612 (2022)."},{"key":"e_1_3_2_1_32_1","volume-title":"Ita: image-text alignments for multi-modal named entity recognition. arXiv preprint arXiv:2112.06482","author":"Wang Xinyu","year":"2021","unstructured":"Xinyu Wang, Min Gui, Yong Jiang, Zixia Jia, Nguyen Bach, Tao Wang, Zhongqiang Huang, Fei Huang, and Kewei Tu. 2021. Ita: image-text alignments for multi-modal named entity recognition. arXiv preprint arXiv:2112.06482 (2021)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859972"},{"key":"e_1_3_2_1_34_1","volume-title":"Eda: Easy data augmentation techniques for boosting performance on text classification tasks. arXiv preprint arXiv:1901.11196","author":"Wei Jason","year":"2019","unstructured":"Jason Wei and Kai Zou. 2019. Eda: Easy data augmentation techniques for boosting performance on text classification tasks. arXiv preprint arXiv:1901.11196 (2019)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612470"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413650"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3488560.3498475"},{"key":"e_1_3_2_1_38_1","volume-title":"Improving multimodal named entity recognition via entity span detection with unified multimodal transformer","author":"Yu Jianfei","unstructured":"Jianfei Yu, Jing Jiang, Li Yang, and Rui Xia. 2020. Improving multimodal named entity recognition via entity span detection with unified multimodal transformer. Association for Computational Linguistics."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.508"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17687"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11962"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548228"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3476968"},{"key":"e_1_3_2_1_44_1","volume-title":"Melm: Data augmentation with masked entity language modeling for low-resource ner. arXiv preprint arXiv:2108.13655","author":"Zhou Ran","year":"2021","unstructured":"Ran Zhou, Xin Li, Ruidan He, Lidong Bing, Erik Cambria, Luo Si, and Chunyan Miao. 2021. Melm: Data augmentation with masked entity language modeling for low-resource ner. arXiv preprint arXiv:2108.13655 (2021)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681598","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681598","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:49Z","timestamp":1750295869000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681598"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":44,"alternative-id":["10.1145\/3664647.3681598","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681598","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}