{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T16:48:47Z","timestamp":1755794927819,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3737203","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T21:04:26Z","timestamp":1754255066000},"page":"4387-4395","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["<i>COEF-VQ:<\/i>\n            Cost-Efficient Video Quality Understanding through a Cascaded Multimodal LLM Framework"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-8724-7784","authenticated-orcid":false,"given":"Xin","family":"Dong","sequence":"first","affiliation":[{"name":"ByteDance Inc., San Jose, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7826-9324","authenticated-orcid":false,"given":"Sen","family":"Jia","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6534-7394","authenticated-orcid":false,"given":"Ming Rui","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1882-3331","authenticated-orcid":false,"given":"Yan","family":"Li","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0303-5885","authenticated-orcid":false,"given":"Zhenheng","family":"Yang","sequence":"additional","affiliation":[{"name":"ByteDance Inc., San Jose, CA, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8067-4602","authenticated-orcid":false,"given":"Bingfeng","family":"Deng","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6699-5192","authenticated-orcid":false,"given":"Hongyu","family":"Xiong","sequence":"additional","affiliation":[{"name":"ByteDance Inc., San Jose, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_2_3_1","volume-title":"Unsupervised cross-lingual representation learning at scale. arXiv preprint arXiv:1911.02116","author":"Conneau A","year":"2019","unstructured":"A Conneau. 2019. Unsupervised cross-lingual representation learning at scale. arXiv preprint arXiv:1911.02116 (2019)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_5_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_2_6_1","volume-title":"Llava-onevision: Easy visual task transfer. 
arXiv preprint arXiv:2408.03326","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Yanwei Li, Ziwei Liu, and Chunyuan Li. 2024. Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_2_7_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34 (2021), 9694-9705."},{"key":"e_1_3_2_2_8_1","volume-title":"Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm. arXiv preprint arXiv:2110.05208","author":"Li Yangguang","year":"2021","unstructured":"Yangguang Li, Feng Liang, Lichen Zhao, Yufeng Cui, Wanli Ouyang, Jing Shao, Fengwei Yu, and Junjie Yan. 2021. Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm. arXiv preprint arXiv:2110.05208 (2021)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"e_1_3_2_2_10_1","volume-title":"CAT: Enhancing Multimodal Large Language Model to Answer Questions in Dynamic Audio-Visual Scenarios. https:\/\/link.springer.com\/chapter\/10.1007\/978-3-031-72684-2_9","author":"Liu Jian","year":"2023","unstructured":"Jian Liu, Yu Lee, and Ming Zhang. 2023. CAT: Enhancing Multimodal Large Language Model to Answer Questions in Dynamic Audio-Visual Scenarios. https:\/\/link.springer.com\/chapter\/10.1007\/978-3-031-72684-2_9"},{"key":"e_1_3_2_2_11_1","unstructured":"Ming Liu Qi Zhang and Jian Li. 2023. 
Macaw-LLM: Multi-Modal Language Modeling with Image Audio Video and Text Integration. https:\/\/arxiv.org\/abs\/2306.09093"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_2_13_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_2_14_1","volume-title":"Ovis: Structural embedding alignment for multimodal large language model. arXiv preprint arXiv:2405.20797","author":"Lu Shiyin","year":"2024","unstructured":"Shiyin Lu, Yang Li, Qing-Guo Chen, Zhao Xu, Weihua Luo, Kaifu Zhang, and Han-Jia Ye. 2024. Ovis: Structural embedding alignment for multimodal large language model. arXiv preprint arXiv:2405.20797 (2024)."},{"key":"e_1_3_2_2_15_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_2_16_1","volume-title":"International conference on machine learning. PMLR, 28492-28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, JongWook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. 
PMLR, 28492-28518."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_2_18_1","volume-title":"Garnett (Eds.)","volume":"31","author":"Sensoy Murat","year":"2018","unstructured":"Murat Sensoy, Lance Kaplan, and Melih Kandemir. 2018. Evidential Deep Learning to Quantify Classification Uncertainty. In Advances in Neural Information Processing Systems, S. Bengio, H. Wallach, H. Larochelle, K. Grauman, N. Cesa-Bianchi, and R. Garnett (Eds.), Vol. 31. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2018\/file\/a981f2b708044d6fb4a71a1463242520-Paper.pdf"},{"key":"e_1_3_2_2_19_1","volume-title":"Audio-Enhanced Vision-Language Modeling with Latent Space Broadening for High Quality Data Expansion. arXiv preprint arXiv:2503.17551","author":"Sun Yu","year":"2025","unstructured":"Yu Sun, Yin Li, Ruixiao Sun, Chunhui Liu, Fangming Zhou, Ze Jin, Linjie Wang, Xiang Shen, Zhuolin Hao, and Hongyu Xiong. 2025. Audio-Enhanced Vision-Language Modeling with Latent Space Broadening for High Quality Data Expansion. arXiv preprint arXiv:2503.17551 (2025)."},{"key":"e_1_3_2_2_20_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth Katie Millican et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_2_21_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_2_22_1","volume-title":"Saksham Singhal, Subhojit Som, et al.","author":"Wang Wenhui","year":"2022","unstructured":"Wenhui Wang, Hangbo Bao, Li Dong, Johan Bjorck, Zhiliang Peng, Qiang Liu, Kriti Aggarwal, Owais Khan Mohammed, Saksham Singhal, Subhojit Som, et al. 
2022. Image as a foreign language: Beit pretraining for all vision and vision-language tasks. arXiv preprint arXiv:2208.10442 (2022)."},{"key":"e_1_3_2_2_23_1","volume-title":"Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432","author":"Yuan Lu","year":"2021","unstructured":"Lu Yuan, Dongdong Chen, Yi-Ling Chen, Noel Codella, Xiyang Dai, Jianfeng Gao, Houdong Hu, Xuedong Huang, Boxin Li, Chunyuan Li, et al. 2021. Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_2_25_1","volume-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)."},{"key":"e_1_3_2_2_26_1","unstructured":"Pan Zhang Xiaoyi Dong Yuhang Zang Yuhang Cao Rui Qian Lin Chen Qipeng Guo Haodong Duan Bin Wang Linke Ouyang et al. 2024. Internlm-xcomposer-2.5: A versatile large vision language model supporting long-contextual input and output. arXiv preprint arXiv:2407.03320 (2024)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_2_28_1","volume-title":"MEERKAT: Audio-Visual Large Language Model for Grounding in Space and Time. https:\/\/link.springer.com\/chapter\/10.1007\/978-3-031-73039-9_4","author":"Zhu Qiang","year":"2024","unstructured":"Qiang Zhu, Qi Zhang, and Zhen Lee. 2024. MEERKAT: Audio-Visual Large Language Model for Grounding in Space and Time. 
https:\/\/link.springer.com\/chapter\/10.1007\/978-3-031-73039-9_4"}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"],"location":"Toronto ON Canada","acronym":"KDD '25"},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3737203","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,16]],"date-time":"2025-08-16T14:43:47Z","timestamp":1755355427000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3737203"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":28,"alternative-id":["10.1145\/3711896.3737203","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3737203","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}