{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T16:24:06Z","timestamp":1778171046257,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":73,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"funder":[{"name":"RIE2020 Industry Alignment Fund - Industry Collaboration Projects (IAF-ICP)"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611737","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"1045-1054","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":48,"title":["Towards Explainable In-the-Wild Video Quality Assessment: A Database and a Language-Prompted Approach"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8642-8101","authenticated-orcid":false,"given":"Haoning","family":"Wu","sequence":"first","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0339-6166","authenticated-orcid":false,"given":"Erli","family":"Zhang","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2238-2420","authenticated-orcid":false,"given":"Liang","family":"Liao","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6137-5162","authenticated-orcid":false,"given":"Chaofeng","family":"Chen","sequence":"additional","affiliation":[{"name":"Nanyang 
Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6397-0114","authenticated-orcid":false,"given":"Jingwen","family":"Hou","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2998-9817","authenticated-orcid":false,"given":"Annan","family":"Wang","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5026-8820","authenticated-orcid":false,"given":"Wenxiu","family":"Sun","sequence":"additional","affiliation":[{"name":"Sensetime Group Limited, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2942-267X","authenticated-orcid":false,"given":"Qiong","family":"Yan","sequence":"additional","affiliation":[{"name":"Sensetime Research, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9866-1947","authenticated-orcid":false,"given":"Weisi","family":"Lin","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Methodology for the subjective assessment of the quality of television pictures. ITU-R Rec. BT.500","author":"Recommendation 0","year":"2000","unstructured":"Recommendation 500-10: Methodology for the subjective assessment of the quality of television pictures. ITU-R Rec. BT.500, 2000."},{"key":"e_1_3_2_1_2_1","volume-title":"Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track","author":"Antsiferova A.","year":"2022","unstructured":"Antsiferova, A., Lavrushkin, S., Smirnov, M., Gushchin, A., Vatolin, D. S., and Kulikov, D. Video compression dataset and benchmark of learning-based video-quality metrics. 
In Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (2022)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00491"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3088505"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413717"},{"key":"e_1_3_2_1_6_1","volume-title":"Light-vqa: A multi-dimensional quality assessment model for low-light video enhancement. In Proceedings of the 31st ACM International Conference on Multimedia","author":"Dong Y.","year":"2023","unstructured":"Dong, Y., Liu, X., Gao, Y., Zhou, X., Tan, T., and Zhai, G. Light-vqa: A multi-dimensional quality assessment model for low-light video enhancement. In Proceedings of the 31st ACM International Conference on Multimedia (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy A.","year":"2020","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_8_1","unstructured":"Fang Y. Zhu H. Zeng Y. Ma K. and Wang Z. Perceptual quality assessment of smartphone photography. In CVPR."},{"key":"e_1_3_2_1_9_1","volume-title":"Clip-adapter: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544","author":"Gao P.","year":"2021","unstructured":"Gao, P., Geng, S., Zhang, R., Ma, T., Fang, R., Zhang, Y., Li, H., and Qiao, Y. Clip-adapter: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544 (2021)."},{"key":"e_1_3_2_1_10_1","first-page":"9","article-title":"
In-capture mobile video distortions: A study of subjective behavior and objective algorithms","volume":"28","author":"Ghadiyaram D.","year":"2018","unstructured":"Ghadiyaram, D., Pan, J., Bovik, A. C., Moorthy, A. K., Panda, P., and Yang, K.-C. In-capture mobile video distortions: A study of subjective behavior and objective algorithms. IEEE TCSVT 28, 9 (2018), 2061--2077.","journal-title":"IEEE TCSVT"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/QoMEX.2017.7965673"},{"key":"e_1_3_2_1_13_1","first-page":"4041","article-title":"Koniq-10k: An ecologically valid database for deep learning of blind image quality assessment","volume":"29","author":"Hosu V.","year":"2020","unstructured":"Hosu, V., Lin, H., Sziranyi, T., and Saupe, D. Koniq-10k: An ecologically valid database for deep learning of blind image quality assessment. IEEE TIP 29 (2020), 4041--4056.","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3186307"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3308852"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413695"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19781-9_36"},{"key":"e_1_3_2_1_18_1","volume-title":"July","author":"Ilharco G.","year":"2021","unstructured":"Ilharco, G., Wortsman, M., Wightman, R., Gordon, C., Carlini, N., Taori, R., Dave, A., Shankar, V., Namkoong, H., Miller, J., Hajishirzi, H., Farhadi, A., and Schmidt, L. OpenCLIP, July 2021."},{"key":"e_1_3_2_1_19_1","volume-title":"ICML","author":"Jia C.","year":"2021","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q. V., Sung, Y., Li, Z., and Duerig, T. Scaling up visual and vision-language representation learning with noisy text supervision. 
In ICML (2021)."},{"key":"e_1_3_2_1_20_1","volume-title":"The kinetics human action video dataset. ArXiv abs\/1705.06950","author":"Kay W.","year":"2017","unstructured":"Kay, W., Carreira, J., Simonyan, K., Zhang, B., Hillier, C., Vijayanarasimhan, S., Viola, F., Green, T., Back, T., Natsev, A., Suleyman, M., and Zisserman, A. The kinetics human action video dataset. ArXiv abs\/1705.06950 (2017)."},{"key":"e_1_3_2_1_21_1","volume-title":"Vila: Learning image aesthetics from user comments with vision-language pretraining","author":"Ke J.","year":"2023","unstructured":"Ke, J., Ye, K., Yu, J., Wu, Y., Milanfar, P., and Yang, F. Vila: Learning image aesthetics from user comments with vision-language pretraining, 2023."},{"key":"e_1_3_2_1_22_1","first-page":"12","article-title":"Two-level approach for no-reference consumer video quality assessment","volume":"28","author":"Korhonen J","year":"2019","unstructured":"Korhonen, J. Two-level approach for no-reference consumer video quality assessment. IEEE TIP 28, 12 (2019), 5923--5938.","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413845"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611860"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01048"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3164467"},{"key":"e_1_3_2_1_27_1","volume-title":"Agiqa-3k: An open database for ai-generated image quality assessment","author":"Li C.","year":"2023","unstructured":"Li, C., Zhang, Z., Wu, H., Sun, W., Min, X., Liu, X., Zhai, G., and Lin, W. 
Agiqa-3k: An open database for ai-generated image quality assessment, 2023."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351028"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01408-w"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547849"},{"key":"e_1_3_2_1_31_1","volume-title":"J., and Bagdanov, A. D. Exploiting unlabeled data in cnns by self-supervised learning to rank","author":"Liu X.","year":"2019","unstructured":"Liu, X., Van De Weijer, J., and Bagdanov, A. D. Exploiting unlabeled data in cnns by self-supervised learning to rank. IEEE TPAMI (2019), 1--1."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2012.2227726"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.5555\/2354409.2354807"},{"key":"e_1_3_2_1_35_1","volume-title":"Expanding language-image pretrained models for general video recognition. ECCV","author":"Ni B.","year":"2022","unstructured":"Ni, B., Peng, H., Chen, M., Zhang, S., Meng, G., Fu, J., Xiang, S., and Ling, H. Expanding language-image pretrained models for general video recognition. ECCV (2022)."},{"key":"e_1_3_2_1_36_1","first-page":"7","article-title":"Cvd2014-a database for evaluating no-reference video quality assessment algorithms","volume":"25","author":"Nuutinen M.","year":"2016","unstructured":"Nuutinen, M., Virtanen, T., Vaahteranoksa, M., Vuori, T., Oittinen, P., and H\u00e4kkinen, J. Cvd2014-a database for evaluating no-reference video quality assessment algorithms. IEEE TIP 25, 7 (2016).","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_37_1","volume-title":"Down-scaling with learned kernels in multiscale deep neural networks for non-uniform single image deblurring. arXiv preprint arXiv:1903.10157","author":"Park D.","year":"2019","unstructured":"Park, D., Kim, J., and Chun, S. Y. 
Down-scaling with learned kernels in multiscale deep neural networks for non-uniform single image deblurring. arXiv preprint arXiv:1903.10157 (2019)."},{"key":"e_1_3_2_1_38_1","volume-title":"Learning transferable visual models from natural language supervision","author":"Radford A.","year":"2021","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., and Sutskever, I. Learning transferable visual models from natural language supervision, 2021."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"e_1_3_2_1_40_1","first-page":"6","article-title":"Study of subjective and objective quality assessment of video","volume":"19","author":"Seshadrinathan K.","year":"2010","unstructured":"Seshadrinathan, K., Soundararajan, R., Bovik, A. C., and Cormack, L. K. Study of subjective and objective quality assessment of video. IEEE TIP 19, 6 (2010), 1427--1441.","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_41_1","first-page":"2","article-title":"Large-scale study of perceptual video quality","volume":"28","author":"Sinno Z.","year":"2019","unstructured":"Sinno, Z., and Bovik, A. C. Large-scale study of perceptual video quality. IEEE TIP 28, 2 (2019), 612--627.","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_42_1","volume-title":"A deep learning based no-reference quality assessment model for ugc videos. arXiv preprint arXiv:2204.14047","author":"Sun W.","year":"2022","unstructured":"Sun, W., Min, X., Lu, W., and Zhai, G. A deep learning based no-reference quality assessment model for ugc videos. 
arXiv preprint arXiv:2204.14047 (2022)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00143"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/2812802"},{"key":"e_1_3_2_1_45_1","first-page":"4449","article-title":"Ugc-vqa: Benchmarking blind video quality assessment for user generated content","volume":"30","author":"Tu Z.","year":"2021","unstructured":"Tu, Z., Wang, Y., Birkbeck, N., Adsumilli, B., and Bovik, A. C. Ugc-vqa: Benchmarking blind video quality assessment for user generated content. IEEE TIP 30 (2021), 4449--4464.","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/OJSP.2021.3090333"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2699859"},{"key":"e_1_3_2_1_48_1","first-page":"23","article-title":"Vis3: an algorithm for video quality assessment via analysis of spatial and spatiotemporal slices","author":"Vu P. V.","year":"2014","unstructured":"Vu, P. V., and Chandler, D. M. Vis3: an algorithm for video quality assessment via analysis of spatial and spatiotemporal slices. Journal of Electronic Imaging 23 (2014).","journal-title":"Journal of Electronic Imaging"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/103085.103089"},{"key":"e_1_3_2_1_50_1","unstructured":"Wang H. Li G. Liu S. and Kuo C.-C. J. Icme 2021 ugc-vqa challenge."},{"key":"e_1_3_2_1_51_1","volume-title":"Exploring clip for assessing the look and feel of images","author":"Wang J.","year":"2022","unstructured":"Wang, J., Chan, K. C. K., and Loy, C. C. Exploring clip for assessing the look and feel of images, 2022."},{"key":"e_1_3_2_1_52_1","volume-title":"Youtube ugc dataset for video compression research. In 2019 MMSP","author":"Wang Y.","year":"2019","unstructured":"Wang, Y., Inguva, S., and Adsumilli, B. Youtube ugc dataset for video compression research. 
In 2019 MMSP (2019)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01323"},{"key":"e_1_3_2_1_54_1","volume-title":"ICLR","author":"Wang Z.","year":"2022","unstructured":"Wang, Z., Yu, J., Yu, A. W., Dai, Z., Tsvetkov, Y., and Cao, Y. Simvlm: Simple visual language model pretraining with weak supervision. In ICLR (2022)."},{"key":"e_1_3_2_1_55_1","unstructured":"Wiegand T. Draft itu-t recommendation and final draft international standard of joint video specification."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_31"},{"key":"e_1_3_2_1_57_1","unstructured":"Wu H. Chen C. Liao L. Hou J. Sun W. Yan Q. Gu J. and Lin W. Neighbourhood representative sampling for efficient end-to-end video quality assessment."},{"key":"e_1_3_2_1_58_1","unstructured":"Wu H. Chen C. Liao L. Hou J. Sun W. Yan Q. and Lin W. Discovqa: Temporal distortion-content transformers for video quality assessment."},{"key":"e_1_3_2_1_59_1","unstructured":"Wu H. Liao L. Chen C. Hou J. Wang A. Sun W. Yan Q. and Lin W. Disentangling aesthetic and technical effects for video quality assessment of user generated content."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME55011.2023.00070"},{"key":"e_1_3_2_1_61_1","volume-title":"Towards robust text-prompted semantic criterion for in-the-wild video quality assessment","author":"Wu H.","year":"2023","unstructured":"Wu, H., Liao, L., Wang, A., Chen, C., Hou, J. H., Zhang, E., Sun, W. S., Yan, Q., and Lin, W. 
Towards robust text-prompted semantic criterion for in-the-wild video quality assessment, 2023."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475486"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01924"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01380"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00363"},{"key":"e_1_3_2_1_66_1","unstructured":"Yu J. Wang Z. Vasudevan V. Yeung L. Seyedhosseini M. and Wu Y. Coca: Contrastive captioners are image-text foundation models."},{"key":"e_1_3_2_1_67_1","volume-title":"Image composition assessment with saliency-augmented multi-pattern pooling. arXiv preprint arXiv:2104.03133","author":"Zhang B.","year":"2021","unstructured":"Zhang, B., Niu, L., and Zhang, L. Image composition assessment with saliency-augmented multi-pattern pooling. arXiv preprint arXiv:2104.03133 (2021)."},{"key":"e_1_3_2_1_68_1","first-page":"1","article-title":"Blind image quality assessment using a deep bilinear convolutional neural network","volume":"30","author":"Zhang W.","year":"2020","unstructured":"Zhang, W., Ma, K., Yan, J., Deng, D., and Wang, Z. Blind image quality assessment using a deep bilinear convolutional neural network. IEEE TCSVT 30, 1 (2020), 36--47.","journal-title":"IEEE TCSVT"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01352"},{"key":"e_1_3_2_1_70_1","volume-title":"Advancing zero-shot digital human quality assessment through text-prompted evaluation","author":"Zhang Z.","year":"2023","unstructured":"Zhang, Z., Sun, W., Zhou, Y., Wu, H., Li, C., Min, X., and Liu, X. 
Advancing zero-shot digital human quality assessment through text-prompted evaluation, 2023."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00174"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01415"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611737","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611737","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:10:42Z","timestamp":1755821442000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611737"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":73,"alternative-id":["10.1145\/3581783.3611737","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611737","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}