{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T16:48:29Z","timestamp":1755794909437,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,7,20]],"date-time":"2025-07-20T00:00:00Z","timestamp":1752969600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,20]]},"DOI":"10.1145\/3690624.3709408","type":"proceedings-article","created":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T18:42:22Z","timestamp":1743792142000},"page":"2596-2605","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Multi-Branch Collaborative Learning Network for Video Quality Assessment in Industrial Video Search"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-0568-5658","authenticated-orcid":false,"given":"Hengzhu","family":"Tang","sequence":"first","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8821-7797","authenticated-orcid":false,"given":"Zefeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6554-3673","authenticated-orcid":false,"given":"Zhiping","family":"Li","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5936-6678","authenticated-orcid":false,"given":"Zhenyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8658-9742","authenticated-orcid":false,"given":"Xing","family":"Wu","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9054-3322","authenticated-orcid":false,"given":"Li","family":"Gao","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3622-3399","authenticated-orcid":false,"given":"Suqi","family":"Cheng","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0684-6205","authenticated-orcid":false,"given":"Dawei","family":"Yin","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,20]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_2_3_1","unstructured":"Tim Brooks Bill Peebles Connor Holmes Will DePue Yufei Guo Li Jing David Schnurr Joe Taylor Troy Luhman Eric Luhman Clarence Ng Ricky Wang and Aditya Ramesh. 2024. Video generation models as world simulators. (2024). https:\/\/openai.com\/research\/video-generation-models-as-world-simulators"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300931"},{"key":"e_1_3_2_2_6_1","volume-title":"Objective video quality assessment methods: A classification, review, and performance comparison","author":"Chikkerur Shyamprasad","year":"2011","unstructured":"Shyamprasad Chikkerur, Vijay Sundaram, Martin Reisslein, and Lina J Karam. 2011. Objective video quality assessment methods: A classification, review, and performance comparison. IEEE transactions on broadcasting, Vol. 57, 2 (2011), 165--182."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3323028"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00589"},{"key":"e_1_3_2_2_9_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)."},{"key":"e_1_3_2_2_10_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840--6851."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/QoMEX.2017.7965673"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300311"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581494"},{"key":"e_1_3_2_2_16_1","volume-title":"International conference on machine learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904--4916."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2019.2923051"},{"key":"e_1_3_2_2_18_1","volume-title":"Subjective-Aligned Dateset and Metric for Text-to-Video Quality Assessment. arXiv preprint arXiv:2403.11956","author":"Kou Tengchuan","year":"2024","unstructured":"Tengchuan Kou, Xiaohong Liu, Zicheng Zhang, Chunyi Li, Haoning Wu, Xiongkuo Min, Guangtao Zhai, and Ning Liu. 2024. Subjective-Aligned Dateset and Metric for Text-to-Video Quality Assessment. arXiv preprint arXiv:2403.11956 (2024)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2470654.2481301"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351028"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01408-w"},{"key":"e_1_3_2_2_22_1","volume-title":"International conference on machine learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888--12900."},{"key":"e_1_3_2_2_23_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021b. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_2_24_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019b. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581006"},{"key":"e_1_3_2_2_26_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00640"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2015.2502725"},{"key":"e_1_3_2_2_29_1","volume-title":"Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734","author":"Mokady Ron","year":"2021","unstructured":"Ron Mokady, Amir Hertz, and Amit H Bermano. 2021. Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_1"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00035"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415864"},{"key":"e_1_3_2_2_33_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_34_1","volume-title":"International conference on machine learning. Pmlr, 8821--8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International conference on machine learning. Pmlr, 8821--8831."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2014.2299154"},{"key":"e_1_3_2_2_36_1","volume-title":"Neural machine translation of rare words with subword units. arXiv preprint arXiv:1508.07909","author":"Sennrich Rico","year":"2015","unstructured":"Rico Sennrich, Barry Haddow, and Alexandra Birch. 2015. Neural machine translation of rare words with subword units. arXiv preprint arXiv:1508.07909 (2015)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2869673"},{"key":"e_1_3_2_2_38_1","volume-title":"Clip models are few-shot learners: Empirical studies on vqa and visual entailment. arXiv preprint arXiv:2203.07190","author":"Song Haoyu","year":"2022","unstructured":"Haoyu Song, Li Dong, Wei-Nan Zhang, Ting Liu, and Furu Wei. 2022. Clip models are few-shot learners: Empirical studies on vqa and visual entailment. arXiv preprint arXiv:2203.07190 (2022)."},{"key":"e_1_3_2_2_39_1","volume-title":"From show to tell: A survey on deep learning-based image captioning","author":"Stefanini Matteo","year":"2022","unstructured":"Matteo Stefanini, Marcella Cornia, Lorenzo Baraldi, Silvia Cascianelli, Giuseppe Fiameni, and Rita Cucchiara. 2022. From show to tell: A survey on deep learning-based image captioning. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 1 (2022), 539--559."},{"key":"e_1_3_2_2_40_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2021.3072221"},{"key":"e_1_3_2_2_42_1","volume-title":"FVD: A new metric for video generation.","author":"Unterthiner Thomas","year":"2019","unstructured":"Thomas Unterthiner, Sjoerd van Steenkiste, Karol Kurach, Rapha\u00ebl Marinier, Marcin Michalski, and Sylvain Gelly. 2019. FVD: A new metric for video generation. (2019)."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640543.3645143"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.2019.8901772"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/mmsp.2019.8901772"},{"key":"e_1_3_2_2_47_1","volume-title":"DisCoVQA: Temporal Distortion-Content Transformers for Video Quality Assessment. (Jun","author":"Wu Haoning","year":"2022","unstructured":"Haoning Wu, Chaofeng Chen, Liang Liao, Jingwen Hou, Wenxiu Sun, Qiong Yan, and Weisi Lin. 2022a. DisCoVQA: Temporal Distortion-Content Transformers for Video Quality Assessment. (Jun 2022)."},{"key":"e_1_3_2_2_48_1","volume-title":"Disentangling aesthetic and technical effects for video quality assessment of user generated content. arXiv preprint arXiv:2211.04894","author":"Wu Haoning","year":"2022","unstructured":"Haoning Wu, Liang Liao, Chaofeng Chen, Jingwen Hou, Annan Wang, Wenxiu Sun, Qiong Yan, and Weisi Lin. 2022b. Disentangling aesthetic and technical effects for video quality assessment of user generated content. arXiv preprint arXiv:2211.04894, Vol. 2, 5 (2022), 6."},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01843"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611737"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101968"},{"key":"e_1_3_2_2_52_1","volume-title":"CLIPVQA: Video Quality Assessment via CLIP. arXiv preprint arXiv:2407.04928","author":"Xing Fengchuang","year":"2024","unstructured":"Fengchuang Xing, Mingjie Li, Yuan-Gen Wang, Guopu Zhu, and Xiaochun Cao. 2024. CLIPVQA: Video Quality Assessment via CLIP. arXiv preprint arXiv:2407.04928 (2024)."},{"key":"e_1_3_2_2_53_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 14019--14029","author":"Ying Zhenqiang","year":"2021","unstructured":"Zhenqiang Ying, Maniratnam Mandal, Deepti Ghadiyaram, and Alan Bovik. 2021. Patch-vq:'patching up'the video quality problem. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 14019--14029."},{"key":"e_1_3_2_2_54_1","volume-title":"Patch-VQ: \u201dPatching Up","author":"Ying Zhenqiang","year":"2020","unstructured":"Zhenqiang Ying, Mandal Maniratnam, Deepti Ghadiyaram, and AlanC. Bovik. 2020. Patch-VQ: \u201dPatching Up\u201d the Video Quality Problem. Cornell University - arXiv,Cornell University - arXiv (Nov 2020)."},{"key":"e_1_3_2_2_55_1","volume-title":"MD-VQA: Multi-Dimensional Quality Assessment for UGC Live Videos. (Mar","author":"Zhang Zicheng","year":"2023","unstructured":"Zicheng Zhang, Wei Wu, Wei Sun, Dangyang Tu, Wei Lu, Xiongkuo Min, Ying Chen, and Guangtao Zhai. 2023. MD-VQA: Multi-Dimensional Quality Assessment for UGC Live Videos. (Mar 2023)."},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00137"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01064"}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"],"location":"Toronto ON Canada","acronym":"KDD '25"},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3690624.3709408","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3690624.3709408","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,16]],"date-time":"2025-08-16T15:35:09Z","timestamp":1755358509000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3690624.3709408"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,20]]},"references-count":57,"alternative-id":["10.1145\/3690624.3709408","10.1145\/3690624"],"URL":"https:\/\/doi.org\/10.1145\/3690624.3709408","relation":{},"subject":[],"published":{"date-parts":[[2025,7,20]]},"assertion":[{"value":"2025-07-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}