{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T15:59:30Z","timestamp":1776095970616,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"The National Natural Science Foundation of China","award":["62276268"],"award-info":[{"award-number":["62276268"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3696409.3700260","type":"proceedings-article","created":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T09:55:23Z","timestamp":1735379723000},"page":"1-1","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["ViCo: Engaging Video Comment Generation with Human Preference Rewards"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-6559-5620","authenticated-orcid":false,"given":"Yuchong","family":"Sun","sequence":"first","affiliation":[{"name":"Renmin University of China, Beijing, CN"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8857-0953","authenticated-orcid":false,"given":"Bei","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, CN"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3070-9358","authenticated-orcid":false,"given":"Xu","family":"Chen","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, CN"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6036-9035","authenticated-orcid":false,"given":"Ruihua","family":"Song","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, CN"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1025-2012","authenticated-orcid":false,"given":"Jianlong","family":"Fu","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, CN"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,12,28]]},"reference":[{"key":"e_1_3_3_3_2_2","doi-asserted-by":"publisher","unstructured":"Nayyer Aafaq Ajmal Mian Naveed Akhtar Wei Liu and Mubarak Shah. 2023. Dense Video Captioning With Early Linguistic Information Fusion. IEEE Transactions on Multimedia 25 (2023) 2309\u20132322. 10.1109\/TMM.2022.3146005 https:\/\/dl.acm.org\/doi\/10.1109\/TMM.2022.3146005","DOI":"10.1109\/TMM.2022.3146005"},{"key":"e_1_3_3_3_3_2","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et\u00a0al. 2022. Flamingo: a visual language model for few-shot learning. Advances in Neural Information Processing Systems 35 (2022) 23716\u201323736."},{"key":"e_1_3_3_3_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_3_3_5_2","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared\u00a0D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et\u00a0al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"publisher","unstructured":"Shizhe Chen Qin Jin Jia Chen and Alexander\u00a0G. Hauptmann. 2019. Generating Video Descriptions With Latent Topic Guidance. IEEE Transactions on Multimedia 21 9 (2019) 2407\u20132418. 10.1109\/TMM.2019.2896515","DOI":"10.1109\/TMM.2019.2896515"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME55011.2023.00445"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00030"},{"key":"e_1_3_3_3_10_2","first-page":"4171","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). 4171\u20134186."},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"publisher","unstructured":"Lianli Gao Zhao Guo Hanwang Zhang Xing Xu and Heng\u00a0Tao Shen. 2017. Video Captioning With Attention-Based LSTM and Semantic Consistency. IEEE Transactions on Multimedia 19 9 (2017) 2045\u20132055. 10.1109\/TMM.2017.2729019 https:\/\/dl.acm.org\/doi\/10.1109\/TMM.2017.2729019","DOI":"10.1109\/TMM.2017.2729019"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_36"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1147"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_3_3_16_2","volume-title":"International Conference on Machine Learning","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International Conference on Machine Learning."},{"key":"e_1_3_3_3_17_2","volume-title":"International Conference on Machine Learning","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven C.\u00a0H. Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In International Conference on Machine Learning."},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"crossref","unstructured":"Junnan Li Yongkang Wong Qi Zhao and Mohan\u00a0S Kankanhalli. 2019. Video storytelling: Textual summaries for events. IEEE Transactions on Multimedia 22 2 (2019) 554\u2013565.","DOI":"10.1109\/TMM.2019.2930041"},{"key":"e_1_3_3_3_19_2","first-page":"74","volume-title":"Text summarization branches out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381."},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016810"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.495"},{"key":"e_1_3_3_3_23_2","unstructured":"Ron Mokady Amir Hertz and Amit\u00a0H Bermano. 2021. Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2111.09734 (2021)."},{"key":"e_1_3_3_3_24_2","unstructured":"OpenAI. 2022. Introducing ChatGPT. (2022). https:\/\/openai.com\/blog\/chatgpt"},{"key":"e_1_3_3_3_25_2","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et\u00a0al. 2022. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems 35 (2022) 27730\u201327744."},{"key":"e_1_3_3_3_26_2","first-page":"311","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318. https:\/\/dl.acm.org\/doi\/10.3115\/1073083.1073135"},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00854"},{"key":"e_1_3_3_3_28_2","first-page":"8748","volume-title":"International Conference on Machine Learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_3_29_2","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. (2019)."},{"key":"e_1_3_3_3_30_2","unstructured":"John Schulman Filip Wolski Prafulla Dhariwal Alec Radford and Oleg Klimov. 2017. Proximal Policy Optimization Algorithms. ArXiv abs\/1707.06347 (2017)."},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01743"},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01280"},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3479207"},{"key":"e_1_3_3_3_34_2","unstructured":"Maria Tsimpoukelli Jacob\u00a0L Menick Serkan Cabi SM Eslami Oriol Vinyals and Felix Hill. 2021. Multimodal few-shot learning with frozen language models. Advances in Neural Information Processing Systems 34 (2021) 200\u2013212."},{"key":"e_1_3_3_3_35_2","unstructured":"Jianfeng Wang Zhengyuan Yang Xiaowei Hu Linjie Li Kevin Lin Zhe Gan Zicheng Liu Ce Liu and Lijuan Wang. 2022. GIT: A Generative Image-to-text Transformer for Vision and Language. Transactions on Machine Learning Research (2022)."},{"key":"e_1_3_3_3_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00432"},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413890"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"publisher","unstructured":"Wanru Xu Jian Yu Zhenjiang Miao Lili Wan Yi Tian and Qiang Ji. 2021. Deep Reinforcement Polishing Network for Video Captioning. IEEE Transactions on Multimedia 23 (2021) 1772\u20131784. 10.1109\/TMM.2020.3002669 https:\/\/dl.acm.org\/doi\/10.1109\/TMM.2020.3002669","DOI":"10.1109\/TMM.2020.3002669"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","unstructured":"Chenggang Yan Yunbin Tu Xingzheng Wang Yongbing Zhang Xinhong Hao Yongdong Zhang and Qionghai Dai. 2020. STAT: Spatial-Temporal Attention Mechanism for Video Captioning. IEEE Transactions on Multimedia 22 1 (2020) 229\u2013241. 10.1109\/TMM.2019.2924576 https:\/\/dl.acm.org\/doi\/10.1109\/TMM.2019.2924576","DOI":"10.1109\/TMM.2019.2924576"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-75765-6_55"},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"publisher","unstructured":"Zhiwang Zhang Dong Xu Wanli Ouyang and Luping Zhou. 2021. Dense Video Captioning Using Graph-Based Sentence Summarization. IEEE Transactions on Multimedia 23 (2021) 1799\u20131810. 10.1109\/TMM.2020.3003592 https:\/\/dl.acm.org\/doi\/10.1109\/TMM.2020.3003592","DOI":"10.1109\/TMM.2020.3003592"},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-13-2122-1"}],"event":{"name":"MMAsia '24: ACM Multimedia Asia","location":"Auckland New Zealand","acronym":"MMAsia '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 6th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700260","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696409.3700260","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:16Z","timestamp":1750295416000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700260"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":42,"alternative-id":["10.1145\/3696409.3700260","10.1145\/3696409"],"URL":"https:\/\/doi.org\/10.1145\/3696409.3700260","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}