{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:36Z","timestamp":1765339776783,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","funder":[{"name":"Beijing Natural Science Foundation","award":["L252036"],"award-info":[{"award-number":["L252036"]}]},{"name":"National Key R&D Program of China","award":["2022YFB2702100, 2022YFB2703102"],"award-info":[{"award-number":["2022YFB2702100, 2022YFB2703102"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62225203, U22A2099"],"award-info":[{"award-number":["62225203, U22A2099"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Beijing Municipal Science and Technology Commission and Zhongguancun Science Park Management Committee","award":["Z231100007423003"],"award-info":[{"award-number":["Z231100007423003"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754955","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:56:43Z","timestamp":1761371803000},"page":"7558-7567","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["<scp>Lava<\/scp>\n                    : Language Driven Scalable and Versatile Traffic Video Analytics"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-7543-6507","authenticated-orcid":false,"given":"Yanrui","family":"Yu","sequence":"first","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5475-1473","authenticated-orcid":false,"given":"Tianfei","family":"Zhou","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2016-9014","authenticated-orcid":false,"given":"Jiaxin","family":"Sun","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5401-6222","authenticated-orcid":false,"given":"Lianpeng","family":"Qiao","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4522-0722","authenticated-orcid":false,"given":"Lizhong","family":"Ding","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0247-9866","authenticated-orcid":false,"given":"Ye","family":"Yuan","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0181-8379","authenticated-orcid":false,"given":"Guoren","family":"Wang","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"BoT-SORT: Robust associations multi-pedestrian tracking. arXiv preprint arXiv:2206.14651","author":"Aharon Nir","year":"2022","unstructured":"Nir Aharon, Roy Orfaig, and Ben-Zion Bobrovsky. 2022. BoT-SORT: Robust associations multi-pedestrian tracking. 
arXiv preprint arXiv:2206.14651 (2022)."},{"key":"e_1_3_2_1_2_1","volume-title":"Miris: Fast object track queries in video. In SIGMOD.","author":"Bastani Favyen","year":"2020","unstructured":"Favyen Bastani, Songtao He, Arjun Balasingam, Karthik Gopalakrishnan, Mohammad Alizadeh, Hari Balakrishnan, Michael Cafarella, Tim Kraska, and Sam Madden. 2020. Miris: Fast object track queries in video. In SIGMOD."},{"key":"e_1_3_2_1_3_1","volume-title":"OTIF: Efficient tracker pre-processing over large video datasets. In SIGMOD.","author":"Bastani Favyen","year":"2022","unstructured":"Favyen Bastani and Samuel Madden. 2022. OTIF: Efficient tracker pre-processing over large video datasets. In SIGMOD."},{"key":"e_1_3_2_1_4_1","volume-title":"FCM: The fuzzy c-means clustering algorithm. Computers & geosciences","author":"Bezdek James C","year":"1984","unstructured":"James C Bezdek, Robert Ehrlich, and William Full. 1984. FCM: The fuzzy c-means clustering algorithm. Computers & geosciences (1984)."},{"key":"e_1_3_2_1_5_1","volume-title":"Figo: Fine-grained query optimization in video analytics. In SIGMOD.","author":"Cao Jiashen","year":"2022","unstructured":"Jiashen Cao, Karan Sarkar, Ramyad Hadidi, Joy Arulraj, and Hyesoon Kim. 2022. Figo: Fine-grained query optimization in video analytics. In SIGMOD."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2023.3270328"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Ziliang Chen Xin Huang Quanlong Guan Liang Lin and Weiqi Luo. 2023. A retrospect to multi-prompt learning across vision and language. In ICCV.","DOI":"10.1109\/ICCV51070.2023.02028"},{"key":"e_1_3_2_1_8_1","volume-title":"Yolo-world: Real-time open-vocabulary object detection. In CVPR.","author":"Cheng Tianheng","year":"2024","unstructured":"Tianheng Cheng, Lin Song, Yixiao Ge, Wenyu Liu, Xinggang Wang, and Ying Shan. 2024b. Yolo-world: Real-time open-vocabulary object detection. In CVPR."},{"key":"e_1_3_2_1_9_1","volume-title":"VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs. arXiv preprint arXiv:2406.07476","author":"Cheng Zesen","year":"2024","unstructured":"Zesen Cheng, Sicong Leng, Hang Zhang, Yifei Xin, Xin Li, Guanzheng Chen, Yongxin Zhu, Wenqi Zhang, Ziyang Luo, Deli Zhao, and Lidong Bing. 2024a. VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs. arXiv preprint arXiv:2406.07476 (2024)."},{"key":"e_1_3_2_1_10_1","unstructured":"Eulrang Cho Jooyeon Kim and Hyunwoo J Kim. 2023. Distribution-aware prompt tuning for vision-language models. In ICCV."},{"key":"e_1_3_2_1_11_1","first-page":"1946","article-title":"Feature re-learning with data augmentation for video relevance prediction","volume":"33","author":"Dong Jianfeng","year":"2019","unstructured":"Jianfeng Dong, Xun Wang, Leimin Zhang, Chaoxi Xu, Gang Yang, and Xirong Li. 2019. Feature re-learning with data augmentation for video relevance prediction. TKDE, Vol. 33, 5 (2019), 1946-1959.","journal-title":"TKDE"},{"key":"e_1_3_2_1_12_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_13_1","volume-title":"Metam: Goal-oriented data discovery. 
In ICDE.","author":"Galhotra Sainyam","year":"2023","unstructured":"Sainyam Galhotra, Yue Gong, and Raul Castro Fernandez. 2023. Metam: Goal-oriented data discovery. In ICDE."},{"key":"e_1_3_2_1_14_1","volume-title":"FastReID: A Pytorch Toolbox for General Instance Re-identification. arXiv preprint arXiv:2006.02631","author":"He Lingxiao","year":"2020","unstructured":"Lingxiao He, Xingyu Liao, Wu Liu, Xinchen Liu, Peng Cheng, and Tao Mei. 2020. FastReID: A Pytorch Toolbox for General Instance Re-identification. arXiv preprint arXiv:2006.02631 (2020)."},{"key":"e_1_3_2_1_15_1","volume-title":"Multiagent architectures for intelligent traffic management systems. Transportation Research Part C: Emerging Technologies","author":"Hern\u00e1ndez Josefa Z","year":"2002","unstructured":"Josefa Z Hern\u00e1ndez, Sascha Ossowski, and Ana Garcia-Serrano. 2002. Multiagent architectures for intelligent traffic management systems. Transportation Research Part C: Emerging Technologies (2002)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Shiyu Hou Tianfei Zhou Shuai Zhang Ye Yuan and Guoren Wang. 2025. Prompt Tuning In a Compact Attribute Space. In AAAI.","DOI":"10.1609\/aaai.v39i4.32365"},{"key":"e_1_3_2_1_17_1","volume-title":"Focus: Querying large video datasets with low latency and low cost. In OSDI.","author":"Hsieh Kevin","year":"2018","unstructured":"Kevin Hsieh, Ganesh Ananthanarayanan, Peter Bodik, Shivaram Venkataraman, Paramvir Bahl, Matthai Philipose, Phillip B Gibbons, and Onur Mutlu. 2018. Focus: Querying large video datasets with low latency and low cost. In OSDI."},{"key":"e_1_3_2_1_18_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"VLDB","volume":"13","author":"Kang Daniel","year":"2018","unstructured":"Daniel Kang, Peter Bailis, and Matei Zaharia. 2018. BlazeIt: Optimizing Declarative Aggregation and Limit Queries for Neural Network-Based Video Analytics. VLDB, Vol. 13, 4 (2018)."},{"key":"e_1_3_2_1_20_1","unstructured":"Daniel Kang John Emmons Firas Abuzaid Peter Bailis and Matei Zaharia. [n.d.]. NoScope: Optimizing Neural Network Queries over Video at Scale. VLDB ([n.d.])."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Daniel Kang John Guibas Peter D Bailis Tatsunori Hashimoto and Matei Zaharia. 2022a. TASTI: semantic indexes for machine learning-based queries over unstructured data. In SIGMOD.","DOI":"10.1145\/3514221.3517897"},{"key":"e_1_3_2_1_22_1","volume-title":"VIVA: An End-to-End System for Interactive Video Analytics.. In CIDR.","author":"Kang Daniel","year":"2022","unstructured":"Daniel Kang, Francisco Romero, Peter D Bailis, Christos Kozyrakis, and Matei Zaharia. 2022b. VIVA: An End-to-End System for Interactive Video Analytics.. In CIDR."},{"key":"e_1_3_2_1_23_1","volume-title":"Maple: Multi-modal prompt learning. In CVPR.","author":"Khattak Muhammad Uzair","year":"2023","unstructured":"Muhammad Uzair Khattak, Hanoona Rasheed, Muhammad Maaz, Salman Khan, and Fahad Shahbaz Khan. 2023a. Maple: Multi-modal prompt learning. In CVPR."},{"key":"e_1_3_2_1_24_1","volume-title":"Muzammal Naseer, Salman Khan, Ming-Hsuan Yang, and Fahad Shahbaz Khan.","author":"Khattak Muhammad Uzair","year":"2023","unstructured":"Muhammad Uzair Khattak, Syed Talal Wasim, Muzammal Naseer, Salman Khan, Ming-Hsuan Yang, and Fahad Shahbaz Khan. 
2023b. Self-regulating prompts: Foundational model adaptation without forgetting. In CVPR."},{"key":"e_1_3_2_1_25_1","first-page":"5023","article-title":"Video monitoring queries","volume":"34","author":"Koudas Nick","year":"2020","unstructured":"Nick Koudas, Raymond Li, and Ioannis Xarchakos. 2020. Video monitoring queries. TKDE, Vol. 34, 10 (2020), 5023-5036.","journal-title":"TKDE"},{"key":"e_1_3_2_1_26_1","unstructured":"Ziliang Lai Chenxia Han Chris Liu Pengfei Zhang Eric Lo and Ben Kao. 2021. Top-K Deep Video Analytics: A Probabilistic Approach. In SIGMOD."},{"key":"e_1_3_2_1_27_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023b. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In ICML."},{"key":"e_1_3_2_1_28_1","volume-title":"Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355","author":"Li KunChang","year":"2023","unstructured":"KunChang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023a. Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)."},{"key":"e_1_3_2_1_29_1","first-page":"4538","article-title":"Video super-resolution reconstruction based on deep learning and spatio-temporal feature self-similarity","volume":"34","author":"Liang Meiyu","year":"2020","unstructured":"Meiyu Liang, Junping Du, Linghui Li, Zhe Xue, Xiaoxiao Wang, Feifei Kou, and Xu Wang. 2020. Video super-resolution reconstruction based on deep learning and spatio-temporal feature self-similarity. TKDE, Vol. 34, 9 (2020), 4538-4553.","journal-title":"TKDE"},{"key":"e_1_3_2_1_30_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023. Visual instruction tuning. In NeurIPS."},{"key":"e_1_3_2_1_31_1","volume-title":"Kangaroo: A powerful video-language model supporting long-context video input. arXiv preprint arXiv:2408.15542","author":"Liu Jiajun","year":"2024","unstructured":"Jiajun Liu, Yibing Wang, Hanghang Ma, Xiaoping Wu, Xiaoqi Ma, Xiaoming Wei, Jianbin Jiao, Enhua Wu, and Jie Hu. 2024. Kangaroo: A powerful video-language model supporting long-context video input. arXiv preprint arXiv:2408.15542 (2024)."},{"key":"e_1_3_2_1_32_1","unstructured":"Shilong Liu Zhaoyang Zeng Tianhe Ren Feng Li Hao Zhang Jie Yang Qing Jiang Chunyuan Li Jianwei Yang Hang Su et al. [n.d.]. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In ECCV."},{"key":"e_1_3_2_1_33_1","volume-title":"Video-RAG: Visually-aligned Retrieval-Augmented Long Video Comprehension. arXiv preprint arXiv:2411.13093","author":"Luo Yongdong","year":"2024","unstructured":"Yongdong Luo, Xiawu Zheng, Xiao Yang, Guilin Li, Haojia Lin, Jinfa Huang, Jiayi Ji, Fei Chao, Jiebo Luo, and Rongrong Ji. 2024. Video-RAG: Visually-aligned Retrieval-Augmented Long Video Comprehension. arXiv preprint arXiv:2411.13093 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Exsample: Efficient searches on video repositories through adaptive sampling. In ICDE.","author":"Moll Oscar","year":"2022","unstructured":"Oscar Moll, Favyen Bastani, Sam Madden, Mike Stonebraker, Vijay Gadepally, and Tim Kraska. 2022. Exsample: Efficient searches on video repositories through adaptive sampling. In ICDE."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Oscar Moll Manuel Favela Samuel Madden Vijay Gadepally and Michael Cafarella. 2023. SeeSaw: interactive ad-hoc search over image databases. 
In SIGMOD.","DOI":"10.1145\/3626754"},{"key":"e_1_3_2_1_36_1","volume-title":"Vikranth Dwaracherla, Morteza Ibrahimi, Xiuyuan Lu, and Benjamin Van Roy.","author":"Osband Ian","year":"2023","unstructured":"Ian Osband, Zheng Wen, Seyed Mohammad Asghari, Vikranth Dwaracherla, Morteza Ibrahimi, Xiuyuan Lu, and Benjamin Van Roy. 2023. Approximate thompson sampling via epistemic neural networks. In Uncertainty in Artificial Intelligence."},{"key":"e_1_3_2_1_37_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_2_1_38_1","volume-title":"Zelda: Video analytics using vision-language models. arXiv preprint arXiv:2305.03785","author":"Romero Francisco","year":"2023","unstructured":"Francisco Romero, Caleb Winston, Johann Hauswald, Matei Zaharia, and Christos Kozyrakis. 2023. Zelda: Video analytics using vision-language models. arXiv preprint arXiv:2305.03785 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Abbas Kazerouni, Ian Osband, Zheng Wen, et al.","author":"Russo Daniel J","year":"2018","unstructured":"Daniel J Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, Zheng Wen, et al., 2018. A tutorial on thompson sampling. Foundations and Trends\u00ae in Machine Learning (2018)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2023.3278695"},{"key":"e_1_3_2_1_41_1","volume-title":"Moviechat: From dense token to sparse memory for long video understanding. In CVPR.","author":"Song Enxin","year":"2024","unstructured":"Enxin Song, Wenhao Chai, Guanhong Wang, Yucheng Zhang, Haoyang Zhou, Feiyang Wu, Haozhe Chi, Xun Guo, Tian Ye, Yanting Zhang, et al., 2024. Moviechat: From dense token to sparse memory for long video understanding. In CVPR."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Xi Tang Jihao Qiu Lingxi Xie Yunjie Tian Jianbin Jiao and Qixiang Ye. 2025. Adaptive Keyframe Sampling for Long Video Understanding. In CVPR.","DOI":"10.1109\/CVPR52734.2025.02711"},{"key":"e_1_3_2_1_43_1","volume-title":"Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al.","author":"Team Gemini","year":"2024","unstructured":"Gemini Team, Petko Georgiev, Ving Ian Lei, Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al., 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)."},{"key":"e_1_3_2_1_44_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"VideoRFT: Incentivizing Video Reasoning Capability in MLLMs via Reinforced Fine-Tuning. arXiv preprint arXiv:2505.12434","author":"Wang Qi","year":"2025","unstructured":"Qi Wang, Yanrui Yu, Ye Yuan, Rui Mao, and Tianfei Zhou. 2025b. VideoRFT: Incentivizing Video Reasoning Capability in MLLMs via Reinforced Fine-Tuning. 
arXiv preprint arXiv:2505.12434 (2025)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Ziyang Wang Shoubin Yu Elias Stengel-Eskin Jaehong Yoon Feng Cheng Gedas Bertasius and Mohit Bansal. 2025a. VideoTree: Adaptive Tree-based Video Representation for LLM Reasoning on Long Videos. In CVPR.","DOI":"10.1109\/CVPR52734.2025.00311"},{"key":"e_1_3_2_1_47_1","unstructured":"Yanchao Xu Dongxiang Zhang Shuhao Zhang Sai Wu Zexu Feng and Gang Chen. 2024. Predictive and Near-Optimal Sampling for View Materialization in Video Databases. In SIGMOD."},{"key":"e_1_3_2_1_48_1","volume-title":"Joy Arulraj, and Umakishore Ramachandran.","author":"Xu Zhuangdi","year":"2022","unstructured":"Zhuangdi Xu, Gaurav Tarlok Kakkar, Joy Arulraj, and Umakishore Ramachandran. 2022. EVA: A symbolic approach to accelerating exploratory video analytics with materialized views. In SIGMOD."},{"key":"e_1_3_2_1_49_1","volume-title":"nsdb: Architecting the next generation database by integrating neural and symbolic systems. VLDB","author":"Yuan Ye","year":"2024","unstructured":"Ye Yuan, Bo Tang, Tianfei Zhou, Zhiwei Zhang, and Jianbin Qin. 2024. nsdb: Architecting the next generation database by integrating neural and symbolic systems. VLDB (2024)."},{"key":"e_1_3_2_1_50_1","first-page":"11975","article-title":"Sigmoid loss for language image pre-training","author":"Zhai Xiaohua","year":"2023","unstructured":"Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, and Lucas Beyer. 2023. Sigmoid loss for language image pre-training. In ICCV. 11975-11986.","journal-title":"ICCV."},{"key":"e_1_3_2_1_51_1","first-page":"604","article-title":"Co-Movement Pattern Mining from Videos","volume":"17","author":"Zhang Dongxiang","year":"2023","unstructured":"Dongxiang Zhang, Teng Ma, Junnan Hu, Yijun Bei, Kian-Lee Tan, and Gang Chen. 2023. Co-Movement Pattern Mining from Videos. VLDB, Vol. 17, 3 (2023), 604-616.","journal-title":"VLDB"},{"key":"e_1_3_2_1_52_1","volume-title":"Long context transfer from language to vision. arXiv preprint arXiv:2406.16852","author":"Zhang Peiyuan","year":"2024","unstructured":"Peiyuan Zhang, Kaichen Zhang, Bo Li, Guangtao Zeng, Jingkang Yang, Yuanhan Zhang, Ziyue Wang, Haoran Tan, Chunyuan Li, and Ziwei Liu. 2024b. Long context transfer from language to vision. arXiv preprint arXiv:2406.16852 (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"Video instruction tuning with synthetic data. arXiv preprint arXiv:2410.02713","author":"Zhang Yuanhan","year":"2024","unstructured":"Yuanhan Zhang, Jinming Wu, Wei Li, Bo Li, Zejun Ma, Ziwei Liu, and Chunyuan Li. 2024a. Video instruction tuning with synthetic data. 
arXiv preprint arXiv:2410.02713 (2024)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3615048"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754955","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:05:49Z","timestamp":1765339549000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754955"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":55,"alternative-id":["10.1145\/3746027.3754955","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754955","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}