{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T19:32:22Z","timestamp":1777059142671,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFC3600402"],"award-info":[{"award-number":["2022YFC3600402"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681367","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"5270-5279","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Shapley Value-based Contrastive Alignment for Multimodal Information Extraction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-8766-3073","authenticated-orcid":false,"given":"Wen","family":"Luo","sequence":"first","affiliation":[{"name":"State Key Laboratory of Multimedia Information Processing, Peking University &amp; School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8760-4397","authenticated-orcid":false,"given":"Yu","family":"Xia","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Multimedia Information Processing, Peking University &amp; School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0333-6588","authenticated-orcid":false,"given":"Shen","family":"Tianshu","sequence":"additional","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7493-0786","authenticated-orcid":false,"given":"Sujian","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the 27th International Conference on Computational Linguistics. Association for Computational Linguistics","author":"Akbik Alan","year":"2018","unstructured":"Alan Akbik, Duncan Blythe, and Roland Vollgraf. 2018. Contextual String Embeddings for Sequence Labeling. In Proceedings of the 27th International Conference on Computational Linguistics. Association for Computational Linguistics, Santa Fe, New Mexico, USA, 1638--1649. https:\/\/aclanthology.org\/C18--1139"},{"key":"e_1_3_2_1_2_1","first-page":"23716","article-title":"Flamingo: a visual language model for few-shot learning","volume":"35","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in Neural Information Processing Systems, Vol. 35 (2022), 23716--23736.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00061"},{"key":"e_1_3_2_1_5_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, Vol. 33 (2020), 12449--12460."},{"key":"e_1_3_2_1_6_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1279"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cor.2008.04.004"},{"key":"e_1_3_2_1_9_1","volume-title":"Learning Implicit Entity-object Relations by Bidirectional Generative Alignment for Multimodal NER. arXiv preprint arXiv:2308.02570","author":"Chen Feng","year":"2023","unstructured":"Feng Chen, Jiajia Liu, Kaixiang Ji, Wang Ren, Jian Wang, and Jingdong Wang. 2023. Learning Implicit Entity-object Relations by Bidirectional Generative Alignment for Multimodal NER. arXiv preprint arXiv:2308.02570 (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531992"},{"key":"e_1_3_2_1_11_1","volume-title":"Good visual guidance makes a better extractor: Hierarchical visual prefix for multimodal entity and relation extraction. arXiv preprint arXiv:2205.03521","author":"Chen Xiang","year":"2022","unstructured":"Xiang Chen, Ningyu Zhang, Lei Li, Yunzhi Yao, Shumin Deng, Chuanqi Tan, Fei Huang, Luo Si, and Huajun Chen. 2022. Good visual guidance makes a better extractor: Hierarchical visual prefix for multimodal entity and relation extraction. arXiv preprint arXiv:2205.03521 (2022)."},{"key":"e_1_3_2_1_12_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arxiv: 2305.06500 [cs.CV]"},{"key":"e_1_3_2_1_13_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF01780630"},{"key":"e_1_3_2_1_15_1","volume-title":"International conference on machine learning. PMLR, 2242--2251","author":"Ghorbani Amirata","year":"2019","unstructured":"Amirata Ghorbani and James Zou. 2019. Data shapley: Equitable valuation of data for machine learning. In International conference on machine learning. PMLR, 2242--2251."},{"key":"e_1_3_2_1_16_1","volume-title":"Bargaining foundations of Shapley value. Econometrica: Journal of the Econometric Society","author":"Gul Faruk","year":"1989","unstructured":"Faruk Gul. 1989. Bargaining foundations of Shapley value. Econometrica: Journal of the Econometric Society (1989), 81--95."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i7.25971"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548427"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.14778\/3342263.3342637"},{"key":"e_1_3_2_1_20_1","volume-title":"The 22nd International Conference on Artificial Intelligence and Statistics. PMLR, 1167--1176","author":"Jia Ruoxi","year":"2019","unstructured":"Ruoxi Jia, David Dao, Boxin Wang, Frances Ann Hubis, Nick Hynes, Nezihe Merve G\u00fcrel, Bo Li, Ce Zhang, Dawn Song, and Costas J Spanos. 2019. Towards efficient data valuation based on the shapley value. In The 22nd International Conference on Artificial Intelligence and Statistics. PMLR, 1167--1176."},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the Eighteenth International Conference on Machine Learning. 282--289","author":"Lafferty John D","year":"2001","unstructured":"John D Lafferty, Andrew McCallum, and Fernando CN Pereira. 2001. Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data. In Proceedings of the Eighteenth International Conference on Machine Learning. 282--289."},{"key":"e_1_3_2_1_23_1","volume-title":"Multimodal foundation models: From specialists to general-purpose assistants. arXiv preprint arXiv:2309.10020","author":"Li Chunyuan","year":"2023","unstructured":"Chunyuan Li, Zhe Gan, Zhengyuan Yang, Jianwei Yang, Linjie Li, Lijuan Wang, and Jianfeng Gao. 2023. Multimodal foundation models: From specialists to general-purpose assistants. arXiv preprint arXiv:2309.10020, Vol. 1, 2 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"FLAT: Chinese NER using flat-lattice transformer. arXiv preprint arXiv:2004.11795","author":"Li Xiaonan","year":"2020","unstructured":"Xiaonan Li, Hang Yan, Xipeng Qiu, and Xuanjing Huang. 2020. FLAT: Chinese NER using flat-lattice transformer. arXiv preprint arXiv:2004.11795 (2020)."},{"key":"e_1_3_2_1_26_1","volume-title":"Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023. Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"Visual instruction tuning. arXiv preprint arXiv:2304.08485","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)."},{"key":"e_1_3_2_1_28_1","unstructured":"Peipei Liu Hong Li Yimo Ren Jie Liu Shuaizong Si Hongsong Zhu and Limin Sun. 2024. Hierarchical Aligned Multimodal Learning for NER on Tweet Posts. arxiv: 2305.08372 [cs.CL]"},{"key":"e_1_3_2_1_29_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1185"},{"key":"e_1_3_2_1_31_1","volume-title":"An empirical study of scaling instruct-tuned large multimodal models. arXiv preprint arXiv:2309.09958","author":"Lu Yadong","year":"2023","unstructured":"Yadong Lu, Chunyuan Li, Haotian Liu, Jianwei Yang, Jianfeng Gao, and Yelong Shen. 2023. An empirical study of scaling instruct-tuned large multimodal models. arXiv preprint arXiv:2309.09958 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Active Contrastive Learning of Audio-Visual Video Representations. In International Conference on Learning Representations.","author":"Ma Shuang","year":"2020","unstructured":"Shuang Ma, Zhaoyang Zeng, Daniel McDuff, and Yale Song. 2020. Active Contrastive Learning of Audio-Visual Video Representations. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_33_1","volume-title":"Bounding the estimation error of sampling-based Shapley value approximation. arXiv preprint arXiv:1306.4265","author":"Maleki Sasan","year":"2013","unstructured":"Sasan Maleki, Long Tran-Thanh, Greg Hines, Talal Rahwan, and Alex Rogers. 2013. Bounding the estimation error of sampling-based Shapley value approximation. arXiv preprint arXiv:1306.4265 (2013)."},{"key":"e_1_3_2_1_34_1","volume-title":"Multimodal named entity recognition for short social media posts. arXiv preprint arXiv:1802.07862","author":"Moon Seungwhan","year":"2018","unstructured":"Seungwhan Moon, Leonardo Neves, and Vitor Carvalho. 2018. Multimodal named entity recognition for short social media posts. arXiv preprint arXiv:1802.07862 (2018)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01229"},{"key":"e_1_3_2_1_36_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i15.17633"},{"key":"e_1_3_2_1_38_1","volume-title":"Multimodal Question Answering for Unified Information Extraction. arXiv preprint arXiv:2310.03017","author":"Sun Yuxuan","year":"2023","unstructured":"Yuxuan Sun, Kai Zhang, and Yu Su. 2023. Multimodal Question Answering for Unified Information Extraction. arXiv preprint arXiv:2310.03017 (2023)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.232"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859972"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413650"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3488560.3498475"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.306"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i9.26309"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17687"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11962"},{"key":"e_1_3_2_1_49_1","volume-title":"Svit: Scaling up visual instruction tuning. arXiv preprint arXiv:2307.04087","author":"Zhao Bo","year":"2023","unstructured":"Bo Zhao, Boya Wu, and Tiejun Huang. 2023. Svit: Scaling up visual instruction tuning. arXiv preprint arXiv:2307.04087 (2023)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.473"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3476968"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME51207.2021.9428274"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3013398"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681367","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681367","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:44Z","timestamp":1750295864000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681367"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":53,"alternative-id":["10.1145\/3664647.3681367","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681367","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}