{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T16:20:10Z","timestamp":1776442810673,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Shenzhen Science and Technology Program","award":["JCYJ20220818103006012, JCYJ20210324115604012, ZDSYS20211021111415025, ZDSYS20220606100601002"],"award-info":[{"award-number":["JCYJ20220818103006012, JCYJ20210324115604012, ZDSYS20211021111415025, ZDSYS20220606100601002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611878","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"6441-6450","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["A Reinforcement Learning-Based Automatic Video Editing Method Using Pre-trained Vision-Language Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6183-6598","authenticated-orcid":false,"given":"Panwen","family":"Hu","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong, Shenzhen, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0886-9982","authenticated-orcid":false,"given":"Nan","family":"Xiao","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Shenzhen, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8098-0265","authenticated-orcid":false,"given":"Feifei","family":"Li","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Shenzhen, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2493-6033","authenticated-orcid":false,"given":"Yongquan","family":"Chen","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Shenzhen, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7950-1662","authenticated-orcid":false,"given":"Rui","family":"Huang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Shenzhen, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9636788"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2601097.2601198"},{"key":"e_1_3_2_1_3_1","volume-title":"Tel Aviv","author":"Argaw Dawit Mureja","year":"2022","unstructured":"Dawit Mureja Argaw, Fabian Caba Heilbron, Joon-Young Lee, Markus Woodson, and In So Kweon. 2022. The anatomy of video editing: A dataset and benchmark suite for ai-assisted video editing. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part VIII. Springer, 201--218."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISM.2006.37"},{"key":"e_1_3_2_1_5_1","volume-title":"ViComp: composition of user-generated videos. Multimedia tools and applications 75, 12","author":"Bano Sophia","year":"2016","unstructured":"Sophia Bano and Andrea Cavallaro. 2016. ViComp: composition of user-generated videos. Multimedia tools and applications 75, 12 (2016), 7187--7210."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1002\/rob.21931"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2013.6607445"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.507"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00053"},{"key":"e_1_3_2_1_10_1","volume-title":"Self-supervised and Weakly Supervised Contrastive Learning for Frame-wise Action Representations. arXiv preprint arXiv:2212.03125","author":"Chen Minghao","year":"2022","unstructured":"Minghao Chen, Renbo Tu, Chenxi Huang, Yuqi Lin, BoxiWu, and Deng Cai. 2022. Self-supervised and Weakly Supervised Contrastive Learning for Frame-wise Action Representations. arXiv preprint arXiv:2212.03125 (2022)."},{"key":"e_1_3_2_1_11_1","volume-title":"Histograms of oriented gradients for human detection. In 2005 IEEE computer society conference on computer vision and pattern recognition (CVPR'05)","author":"Dalal Navneet","unstructured":"Navneet Dalal and Bill Triggs. 2005. Histograms of oriented gradients for human detection. In 2005 IEEE computer society conference on computer vision and pattern recognition (CVPR'05), Vol. 1. Ieee, 886--893."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVMP.2011.8"},{"key":"e_1_3_2_1_13_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_14_1","volume-title":"The psychology of film: Perceiving beyond the cut. Psychological research 71, 4","author":"Germeys Filip","year":"2007","unstructured":"Filip Germeys and G\u00e9ry d'Ydewalle. 2007. The psychology of film: Perceiving beyond the cut. Psychological research 71, 4 (2007), 458--466."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1525\/fq.2013.66.4.10"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS40897.2019.8967592"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/1198302.1198306"},{"key":"e_1_3_2_1_18_1","volume-title":"Reinforcement Learning Based Automatic Personal Mashup Generation. In 2021 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1--6.","author":"Hu Panwen","year":"2021","unstructured":"Panwen Hu, Jiazhen Liu, Tianyu Cao, and Rui Huang. 2021. Reinforcement Learning Based Automatic Personal Mashup Generation. In 2021 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1--6."},{"key":"e_1_3_2_1_19_1","first-page":"5335","article-title":"One-Shot Imitation Drone Filming of Human Motion Videos","volume":"44","author":"Huang Chong","year":"2021","unstructured":"Chong Huang, Yuanjie Dang, Peng Chen, Xin Yang, and Kwang-Ting Cheng. 2021. One-Shot Imitation Drone Filming of Human Motion Videos. IEEE Transactions on Pattern Analysis and Machine Intelligence 44, 9 (2021), 5335--5348.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00437"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2008.2001379"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392427"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2393347.2396442"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073653"},{"key":"e_1_3_2_1_25_1","volume-title":"Continuous control with deep reinforcement learning. arXiv preprint arXiv:1509.02971","author":"Lillicrap Timothy P","year":"2015","unstructured":"Timothy P Lillicrap, Jonathan J Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, Yuval Tassa, David Silver, and Daan Wierstra. 2015. Continuous control with deep reinforcement learning. arXiv preprint arXiv:1509.02971 (2015)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/365024.365310"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000029664.99615.94"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_29_1","volume-title":"Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, et al.","author":"Minderer Matthias","year":"2022","unstructured":"Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, et al. 2022. Simple open-vocabulary object detection with vision transformers. arXiv preprint arXiv:2205.06230 (2022)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.5555\/2354409.2354807"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448981"},{"key":"e_1_3_2_1_32_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_33_1","volume-title":"Computer Graphics Forum","author":"Ronfard R\u00e9mi","unstructured":"R\u00e9mi Ronfard. 2021. Film directing for computer games and animation. In Computer Graphics Forum, Vol. 40. Wiley Online Library, 713--730."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-004-0132-9"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/2393347.2393373"},{"key":"e_1_3_2_1_36_1","volume-title":"Automated Video Mashups: Research and Challenges. MediaSync: Handbook on Multimedia Synchronization","author":"Saini Mukesh Kumar","year":"2018","unstructured":"Mukesh Kumar Saini and Wei Tsang Ooi. 2018. Automated Video Mashups: Research and Challenges. MediaSync: Handbook on Multimedia Synchronization (2018), 167--190."},{"key":"e_1_3_2_1_37_1","volume-title":"Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347","author":"Schulman John","year":"2017","unstructured":"John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. 2017. Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347 (2017)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1874023"},{"key":"e_1_3_2_1_39_1","volume-title":"AI video editing tools","author":"Soe Than Htut","year":"2021","unstructured":"Than Htut Soe. 2021. AI video editing tools. What editors want and how far is AI from delivering? arXiv preprint arXiv:2109.07809 (2021)."},{"key":"e_1_3_2_1_40_1","volume-title":"Policy gradient methods for reinforcement learning with function approximation. Advances in neural information processing systems 12","author":"Sutton Richard S","year":"1999","unstructured":"Richard S Sutton, David McAllester, Satinder Singh, and Yishay Mansour. 1999. Policy gradient methods for reinforcement learning with function approximation. Advances in neural information processing systems 12 (1999)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2006.886292"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-008-0112-6"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2015.75"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISM.2014.44"},{"key":"e_1_3_2_1_46_1","unstructured":"Hui-Yin Wu and Arnav Jhala. 2018. A Joint Attention Model for Automated Editing.. In INT\/WICED@ AIIDE."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2015.2416554"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548268"},{"key":"e_1_3_2_1_49_1","volume-title":"Tel Aviv","author":"Xu Mengde","year":"2022","unstructured":"Mengde Xu, Zheng Zhang, Fangyun Wei, Yutong Lin, Yue Cao, Han Hu, and Xiang Bai. 2022. A Simple Baseline for Open-Vocabulary Semantic Segmentation with Pre-trained Vision-Language Model. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXIX. Springer, 736--753."},{"key":"e_1_3_2_1_50_1","volume-title":"Computable Framework For Live Sport Broadcast Directing. In 2019 IEEE International Symposium on Multimedia (ISM). IEEE, 239--2391","author":"Yang Danqing","year":"2019","unstructured":"Danqing Yang, Longfei Zhang, Yufeng Wu, Shugang Li, Dong Liang, and Gangyi Ding. 2019. Computable Framework For Live Sport Broadcast Directing. In 2019 IEEE International Symposium on Multimedia (ISM). IEEE, 239--2391."},{"key":"e_1_3_2_1_51_1","volume-title":"Enabling Automatic Cinematography with Reinforcement Learning. In 2022 IEEE 5th International Conference on Multimedia Information Processing and Retrieval (MIPR). IEEE, 103--108","author":"Yu Zixiao","year":"2022","unstructured":"Zixiao Yu, Chenyu Yu, Haohong Wang, and Jian Ren. 2022. Enabling Automatic Cinematography with Reinforcement Learning. In 2022 IEEE 5th International Conference on Multimedia Information Processing and Retrieval (MIPR). IEEE, 103--108."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Xinrong Zhang Yanghao Li Yuxing Han and Jiangtao Wen. 2022. AI Video Editing: a Survey. (2022).","DOI":"10.20944\/preprints202201.0016.v1"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611878","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611878","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:01:12Z","timestamp":1755820872000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611878"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":52,"alternative-id":["10.1145\/3581783.3611878","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611878","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}