{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:03:38Z","timestamp":1775815418011,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Project Funded by the Priority Academic Program Development of Jiangsu Higher Education Institutions"},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62006166,62376178,62076175"],"award-info":[{"award-number":["62006166,62376178,62076175"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,28]]},"DOI":"10.1145\/3696410.3714617","type":"proceedings-article","created":{"date-parts":[[2025,5,5]],"date-time":"2025-05-05T16:42:02Z","timestamp":1746463322000},"page":"4004-4013","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Sherlock: Towards Multi-scene Video Abnormal Event Extraction and Localization via a Global-local Spatial-sensitive LLM"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-6437-2061","authenticated-orcid":false,"given":"Junxiao","family":"Ma","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3619-1525","authenticated-orcid":false,"given":"Jingjing","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2144-6921","authenticated-orcid":false,"given":"Jiamin","family":"Luo","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7989-135X","authenticated-orcid":false,"given":"Peiying","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7887-5099","authenticated-orcid":false,"given":"Guodong","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E Hinton","author":"Ba Jimmy Lei","year":"2016","unstructured":"Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. 2016. Layer normalization. arXiv preprint arXiv:1607.06450 (2016)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1167"},{"key":"e_1_3_2_1_3_1","volume-title":"Revisiting Referring Expression Comprehension Evaluation in the Era of Large Multimodal Models. CoRR abs\/2406.16866","author":"Chen Jierun","year":"2024","unstructured":"Jierun Chen, Fangyun Wei, Jinjing Zhao, Sizhe Song, Bohuai Wu, Zhuoxuan Peng, S.-H. Gary Chan, and Hongyang Zhang. 2024. Revisiting Referring Expression Comprehension Evaluation in the Era of Large Multimodal Models. CoRR abs\/2406.16866 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of ICLR","author":"Chen Xiangning","year":"2022","unstructured":"Xiangning Chen, Cho-Jui Hsieh, and Boqing Gong. 2022. When Vision Transformers Outperform ResNets without Pre-training or Strong Data Augmentations. In Proceedings of ICLR 2022."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.338"},{"key":"e_1_3_2_1_6_1","volume-title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. CoRR abs\/2312.14238","author":"Chen Zhe","year":"2023","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, Bin Li, Ping Luo, Tong Lu, Yu Qiao, and Jifeng Dai. 2023. InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. CoRR abs\/2312.14238 (2023)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00543"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of ICPR","author":"Cheng Ming","year":"2020","unstructured":"Ming Cheng, Kunjing Cai, and Ming Li. 2020. RWF-2000: An Open Large Scale Video Database for Violence Detection. In Proceedings of ICPR 2020. 4183--4190."},{"key":"e_1_3_2_1_9_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03--30-vicuna\/"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3268066"},{"key":"e_1_3_2_1_11_1","volume-title":"Why and How: A Comprehensive Benchmark for Causation Understanding of Video Anomaly. CoRR abs\/2405.00181","author":"Du Hang","year":"2024","unstructured":"Hang Du, Sicheng Zhang, Binzhu Xie, Guoshun Nan, Jiayang Zhang, Junrui Xu, Hangyu Liu, Sicong Leng, Jiangming Liu, Hehe Fan, Dajiu Huang, Jing Feng, Linli Chen, Can Zhang, Xuhuan Li, Hao Zhang, Jianhang Chen, Qimei Cui, and Xiaofeng Tao. 2024. Uncovering What, Why and How: A Comprehensive Benchmark for Causation Understanding of Video Anomaly. CoRR abs\/2405.00181 (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01379"},{"key":"e_1_3_2_1_13_1","unstructured":"Xiaoya Gao Jingjing Wang Shoushan Li Min Zhang and Guodong Zhou. 2022. Cognition-driven multimodal personality classification. Sci. China Inf. Sci. (2022)."},{"key":"e_1_3_2_1_14_1","volume-title":"SanMiguel","author":"Garcia-Cobo Guillermo","year":"2023","unstructured":"Guillermo Garcia-Cobo and Juan C. SanMiguel. 2023. Human skeletons and change detection for efficient violence detection in surveillance videos. Comput. Vis. Image Underst. 233 (2023)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00033"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ROBIO.2013.6739466"},{"key":"e_1_3_2_1_19_1","volume-title":"OneLLM: One Framework to Align All Modalities with Language. CoRR abs\/2312.03700","author":"Han Jiaming","year":"2023","unstructured":"Jiaming Han, Kaixiong Gong, Yiyuan Zhang, Jiaqi Wang, Kaipeng Zhang, Dahua Lin, Yu Qiao, Peng Gao, and Xiangyu Yue. 2023. OneLLM: One Framework to Align All Modalities with Language. CoRR abs\/2312.03700 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of ECCV","author":"Davis Larry Hanson Alex Krishnagopal Sanjukta","year":"2019","unstructured":"Krishnagopal Sanjukta Davis Larry Hanson Alex, PNVR Koutilya. 2019. Bidirectional Convolutional LSTM for the Detection of Violence in Videos. In Proceedings of ECCV 2018. 280--295."},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of CVPR","author":"Hasan Mahmudul","year":"2016","unstructured":"Mahmudul Hasan, Jonghyun Choi, Jan Neumann, Amit K. Roy-Chowdhury, and Larry S. Davis. 2016. Learning Temporal Regularity in Video Sequences. In Proceedings of CVPR 2016. 733--742."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of ACL","author":"Hong Yu","year":"2011","unstructured":"Yu Hong, Jianfeng Zhang, Bin Ma, Jian-Min Yao, Guodong Zhou, and Qiaoming Zhu. 2011. Using Cross-Entity Inference to Improve Event Extraction. In Proceedings of ACL 2011. 1127--1136."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of ICLR","author":"Hu Edward J.","year":"2022","unstructured":"Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In Proceedings of ICLR 2022."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1991.3.1.79"},{"key":"e_1_3_2_1_25_1","volume-title":"Mixture of Nested Experts: Adaptive Processing of Visual Tokens. CoRR abs\/2407.19985","author":"Jain Gagan","year":"2024","unstructured":"Gagan Jain, Nidhi Hegde, Aditya Kusupati, Arsha Nagrani, Shyamal Buch, Prateek Jain, Anurag Arnab, and Sujoy Paul. 2024. Mixture of Nested Experts: Adaptive Processing of Visual Tokens. CoRR abs\/2407.19985 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of ACL","author":"Ji Heng","year":"2008","unstructured":"Heng Ji and Ralph Grishman. 2008. Refining Event Extraction through Cross-Document Inference. In Proceedings of ACL 2008. 254--262."},{"key":"e_1_3_2_1_27_1","volume-title":"Chat-UniVi: Unified Visual Representation Empowers Large Language Models with Image and Video Understanding. CoRR abs\/2311.08046","author":"Jin Peng","year":"2023","unstructured":"Peng Jin, Ryuichi Takanobu, Caiwan Zhang, Xiaochun Cao, and Li Yuan. 2023. Chat-UniVi: Unified Visual Representation Empowers Large Language Models with Image and Video Understanding. CoRR abs\/2311.08046 (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00907"},{"key":"e_1_3_2_1_29_1","volume-title":"Segment Anything. In Proceedings of ICCV","author":"Kirillov Alexander","year":"2023","unstructured":"Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chlo\u00e9 Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg, Wan-Yen Lo, Piotr Doll\u00e1r, and Ross B. Girshick. 2023. Segment Anything. In Proceedings of ICCV 2023. 3992--4003."},{"key":"e_1_3_2_1_30_1","volume-title":"Anomaly Locality in Video Surveillance. CoRR abs\/1901.10364","author":"Landi Federico","year":"2019","unstructured":"Federico Landi, Cees G. M. Snoek, and Rita Cucchiara. 2019. Anomaly Locality in Video Surveillance. CoRR abs\/1901.10364 (2019)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.849"},{"key":"e_1_3_2_1_32_1","volume-title":"Otter: A Multi-Modal Model with In-Context Instruction Tuning. CoRR abs\/2305.03726","author":"Li Bo","year":"2023","unstructured":"Bo Li, Yuanhan Zhang, Liangyu Chen, Jinghao Wang, Jingkang Yang, and Ziwei Liu. 2023. Otter: A Multi-Modal Model with In-Context Instruction Tuning. CoRR abs\/2305.03726 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"RSI-CB: A Large-Scale Remote Sensing Image Classification Benchmark Using Crowdsourced Data. Sensors 20","author":"Li Haifeng","year":"2020","unstructured":"Haifeng Li, Xin Dou, Chao Tao, Zhixiang Wu, Jie Chen, Jian Peng, Min Deng, and Ling Zhao. 2020. RSI-CB: A Large-Scale Remote Sensing Image Classification Benchmark Using Crowdsourced Data. Sensors 20 (2020)."},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of ICML 2023. 1973","author":"Li Junnan","year":"1974","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven C. H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In Proceedings of ICML 2023. 19730--19742."},{"key":"e_1_3_2_1_35_1","volume-title":"VideoChat: Chat-Centric Video Understanding. CoRR abs\/2305.06355","author":"Li Kunchang","year":"2023","unstructured":"Kunchang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023. VideoChat: Chat-Centric Video Understanding. CoRR abs\/2305.06355 (2023)."},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of ACL","author":"Li Qi","year":"2013","unstructured":"Qi Li, Heng Ji, and Liang Huang. 2013. Joint Event Extraction via Structured Prediction with Global Features. In Proceedings of ACL 2013. 73--82."},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of ACL","author":"Liao Shasha","year":"2010","unstructured":"Shasha Liao and Ralph Grishman. 2010. Using Document Level Cross-Event Inference to Improve Event Extraction. In Proceedings of ACL 2010. 789--797."},{"key":"e_1_3_2_1_38_1","volume-title":"Video-LLaVA: Learning United Visual Representation by Alignment Before Projection. CoRR abs\/2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-LLaVA: Learning United Visual Representation by Alignment Before Projection. CoRR abs\/2311.10122 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of ECCV","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge J. Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In Proceedings of ECCV 2014. 740--755."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02212"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350998"},{"key":"e_1_3_2_1_42_1","volume-title":"MOELoRA: An MOE-based Parameter Efficient Fine-Tuning Method for Multi-task Medical Applications. CoRR abs\/2310.18339","author":"Liu Qidong","year":"2023","unstructured":"Qidong Liu, Xian Wu, Xiangyu Zhao, Yuanshao Zhu, Derong Xu, Feng Tian, and Yefeng Zheng. 2023. MOELoRA: An MOE-based Parameter Efficient Fine-Tuning Method for Multi-task Medical Applications. CoRR abs\/2310.18339 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of ICLR","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In Proceedings of ICLR 2019."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_51"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.338"},{"key":"e_1_3_2_1_46_1","volume-title":"Valley: Video Assistant with Large Language model Enhanced abilitY. CoRR abs\/2306.07207","author":"Luo Ruipu","year":"2023","unstructured":"Ruipu Luo, Ziwang Zhao, Min Yang, Junwei Dong, Minghui Qiu, Pengcheng Lu, Tao Wang, and Zhongyu Wei. 2023. Valley: Video Assistant with Large Language model Enhanced abilitY. CoRR abs\/2306.07207 (2023)."},{"key":"e_1_3_2_1_47_1","volume-title":"Salman H. Khan, and Fahad Shahbaz Khan.","author":"Maaz Muhammad","year":"2023","unstructured":"Muhammad Maaz, Hanoona Abdul Rasheed, Salman H. Khan, and Fahad Shahbaz Khan. 2023. Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models. CoRR abs\/2306.05424 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating Video-based Large Language Models. CoRR abs\/2311.16103","author":"Ning Munan","year":"2023","unstructured":"Munan Ning, Bin Zhu, Yujia Xie, Bin Lin, Jiaxi Cui, Lu Yuan, Dongdong Chen, and Li Yuan. 2023. Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating Video-based Large Language Models. CoRR abs\/2311.16103 (2023)."},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of ICLR","author":"Puigcerver Joan","year":"2024","unstructured":"Joan Puigcerver, Carlos Riquelme Ruiz, Basil Mustafa, and Neil Houlsby. 2024. From Sparse to Soft Mixtures of Experts. In Proceedings of ICLR 2024."},{"key":"e_1_3_2_1_51_1","volume-title":"Proceedings of ICML","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In Proceedings of ICML 2021. 8748--8763."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W15-0812"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/AVSS56176.2022.9959393"},{"key":"e_1_3_2_1_54_1","volume-title":"PandaGPT: One Model To Instruction-Follow Them All. CoRR abs\/2305.16355","author":"Su Yixuan","year":"2023","unstructured":"Yixuan Su, Tian Lan, Huayang Li, Jialu Xu, Yan Wang, and Deng Cai. 2023. PandaGPT: One Model To Instruction-Follow Them All. CoRR abs\/2305.16355 (2023)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_5"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00678"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/617"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1345"},{"key":"e_1_3_2_1_59_1","volume-title":"Video-GroundingDINO: Towards Open-Vocabulary Spatio-Temporal Video Grounding. CoRR abs\/2401.00901","author":"Wasim Syed Talal","year":"2024","unstructured":"Syed Talal Wasim, Muzammal Naseer, Salman Khan, Ming-Hsuan Yang, and Fahad Shahbaz Khan. 2024. Video-GroundingDINO: Towards Open-Vocabulary Spatio-Temporal Video Grounding. CoRR abs\/2401.00901 (2024)."},{"key":"e_1_3_2_1_60_1","volume-title":"UniRef: Segment Every Reference Object in Spatial and Temporal Spaces. arXiv preprint arXiv:2312.15715","author":"Wu Jiannan","year":"2023","unstructured":"Jiannan Wu, Yi Jiang, Bin Yan, Huchuan Lu, Zehuan Yuan, and Ping Luo. 2023. UniRef: Segment Every Reference Object in Spatial and Temporal Spaces. arXiv preprint arXiv:2312.15715 (2023)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28423"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.10.010"},{"key":"e_1_3_2_1_63_1","volume-title":"ManiFoundation Model for General-Purpose Robotic Manipulation of Contact Synthesis with Arbitrary Objects and Robots. CoRR","author":"Xu Zhixuan","year":"2024","unstructured":"Zhixuan Xu, Chongkai Gao, Zixuan Liu, Gang Yang, Chenrui Tie, Haozhuo Zheng, Haoyu Zhou, Weikun Peng, Debang Wang, Tianyi Chen, Zhouliang Yu, and Lin Shao. 2024. ManiFoundation Model for General-Purpose Robotic Manipulation of Contact Synthesis with Arbitrary Objects and Robots. CoRR (2024)."},{"key":"e_1_3_2_1_64_1","volume-title":"mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. CoRR abs\/2304.14178","author":"Ye Qinghao","year":"2023","unstructured":"Qinghao Ye, Haiyang Xu, Guohai Xu, Jiabo Ye, Ming Yan, Yiyang Zhou, Junyang Wang, Anwen Hu, Pengcheng Shi, Yaya Shi, Chenliang Li, Yuanhong Xu, Hehong Chen, Junfeng Tian, Qian Qi, Ji Zhang, and Fei Huang. 2023. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. CoRR abs\/2304.14178 (2023)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.331"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.454"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548007"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681407"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00133"},{"key":"e_1_3_2_1_72_1","volume-title":"Proceedings of ICLR","author":"Zhu Bin","year":"2024","unstructured":"Bin Zhu, Bin Lin, Munan Ning, Yang Yan, Jiaxi Cui, Hongfa Wang, Yatian Pang, Wenhao Jiang, Junwu Zhang, Zongwei Li, Caiwan Zhang, Zhifeng Li, Wei Liu, and Li Yuan. 2024. LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment. In Proceedings of ICLR 2024."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714617","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714617","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:34Z","timestamp":1750295914000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714617"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":71,"alternative-id":["10.1145\/3696410.3714617","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714617","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}