{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T16:01:28Z","timestamp":1774022488839,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730761","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:42:43Z","timestamp":1753260163000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["EditDuet: A Multi-Agent System for Video Non-Linear Editing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3227-9195","authenticated-orcid":false,"given":"Marcelo","family":"Sandoval-Casta\u00f1eda","sequence":"first","affiliation":[{"name":"TTIC, Chicago, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8656-543X","authenticated-orcid":false,"given":"Bryan","family":"Russell","sequence":"additional","affiliation":[{"name":"Adobe, San Francisco, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2554-5301","authenticated-orcid":false,"given":"Josef","family":"Sivic","sequence":"additional","affiliation":[{"name":"Adobe, San Francisco, USA and Czech Institute of Informatics, Robotics and Cybernetics, Czech Technical University, Prague, Czech Republic"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4700-9398","authenticated-orcid":false,"given":"Gregory","family":"Shakhnarovich","sequence":"additional","affiliation":[{"name":"TTIC, Chicago, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3129-1985","authenticated-orcid":false,"given":"Fabian","family":"Caba Heilbron","sequence":"additional","affiliation":[{"name":"Adobe, San Jose, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_3_2_1","unstructured":"Adobe. 2025. Adobe Premiere Pro. (2025). https:\/\/www.adobe.com\/products\/premiere.html"},{"key":"e_1_3_3_3_3_1","doi-asserted-by":"crossref","unstructured":"Ido Arev Hyun\u00a0Soo Park Yaser Sheikh Jessica Hodgins and Ariel Shamir. 2014. Automatic editing of footage from multiple social cameras. ACM TOG 33 4 (2014) 1\u201311.","DOI":"10.1145\/2601097.2601198"},{"key":"e_1_3_3_3_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_12"},{"key":"e_1_3_3_3_5_1","unstructured":"Avid. 2025. Avid Media Composer. (2025). https:\/\/www.avid.com\/media-composer"},{"key":"e_1_3_3_3_6_1","doi-asserted-by":"crossref","unstructured":"Floraine Berthouzoz Wilmot Li and Maneesh Agrawala. 2012. Tools for placing cuts and transitions in interview video. ACM TOG 31 4 (2012) 1\u20138.","DOI":"10.1145\/2185520.2185563"},{"key":"e_1_3_3_3_7_1","unstructured":"Blackmagic. 2025. DaVinci Resolve 19. (2025). https:\/\/www.blackmagicdesign.com\/products\/davinciresolve\/"},{"key":"e_1_3_3_3_8_1","doi-asserted-by":"crossref","unstructured":"Ted Byrt Janet Bishop and John\u00a0B Carlin. 1993. Bias prevalence and kappa. Journal of clinical epidemiology 46 5 (1993) 423\u2013429.","DOI":"10.1016\/0895-4356(93)90018-V"},{"key":"e_1_3_3_3_9_1","doi-asserted-by":"crossref","unstructured":"Mia Chiquier Utkarsh Mall and Carl Vondrick. 2024. Evolving Interpretable Visual Classifiers with Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.09941 (2024).","DOI":"10.1007\/978-3-031-73039-9_11"},{"key":"e_1_3_3_3_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72920-1_18"},{"key":"e_1_3_3_3_11_1","unstructured":"Qingxiu Dong Lei Li Damai Dai Ce Zheng Jingyuan Ma Rui Li Heming Xia Jingjing Xu Zhiyong Wu Tianyu Liu et\u00a0al. 2022. A survey on in-context learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2301.00234 (2022)."},{"key":"e_1_3_3_3_12_1","unstructured":"Yuhao Du Shunian Chen Wenbo Zan Peizhao Li Mingxuan Wang Dingjie Song Bo Li Yan Hu and Benyou Wang. 2024. BlenderLLM: Training Large Language Models for Computer-Aided Design with Self-improvement. arxiv:https:\/\/arXiv.org\/abs\/2412.14203\u00a0[cs.HC] https:\/\/arxiv.org\/abs\/2412.14203"},{"key":"e_1_3_3_3_13_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri and et. al. 2024. The Llama 3 Herd of Models. arxiv:https:\/\/arXiv.org\/abs\/2407.21783\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_3_3_14_1","unstructured":"EditStock. 2025. EditStock. (2025). https:\/\/editstock.com"},{"key":"e_1_3_3_3_15_1","unstructured":"Alex Graves. 2013. Generating sequences with recurrent neural networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1308.0850 (2013)."},{"key":"e_1_3_3_3_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72989-8_11"},{"key":"e_1_3_3_3_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01436"},{"key":"e_1_3_3_3_18_1","unstructured":"Tuomas Haarnoja Aurick Zhou Kristian Hartikainen George Tucker Sehoon Ha Jie Tan Vikash Kumar Henry Zhu Abhishek Gupta Pieter Abbeel et\u00a0al. 2018. Soft actor-critic algorithms and applications. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1812.05905 (2018)."},{"key":"e_1_3_3_3_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"key":"e_1_3_3_3_20_1","volume-title":"ICML","author":"Hu Ziniu","year":"2024","unstructured":"Ziniu Hu, Ahmet Iscen, Aashi Jain, Thomas Kipf, Yisong Yue, David\u00a0A Ross, Cordelia Schmid, and Alireza Fathi. 2024. SceneCraft: An LLM Agent for Synthesizing 3D Scenes as Blender Code. In ICML."},{"key":"e_1_3_3_3_21_1","first-page":"297","volume-title":"ECCV","author":"Huang Ian","year":"2024","unstructured":"Ian Huang, Guandao Yang, and Leonidas Guibas. 2024b. Blenderalchemy: Editing 3d graphics with vision-language models. In ECCV. Springer, 297\u2013314."},{"key":"e_1_3_3_3_22_1","unstructured":"Kaixuan Huang Yuanhao Qu Henry Cousins William\u00a0A Johnson Di Yin Mihir Shah Denny Zhou Russ Altman Mengdi Wang and Le Cong. 2024a. Crispr-GPT: An LLM agent for automated design of gene-editing experiments. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.18021 (2024)."},{"key":"e_1_3_3_3_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300311"},{"key":"e_1_3_3_3_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581494"},{"key":"e_1_3_3_3_25_1","doi-asserted-by":"crossref","unstructured":"Eakta Jain Yaser Sheikh Ariel Shamir and Jessica Hodgins. 2015. Gaze-driven video re-editing. ACM TOG 34 2 (2015) 1\u201312.","DOI":"10.1145\/2699644"},{"key":"e_1_3_3_3_26_1","unstructured":"Joongwon Kim Bhargavi Paranjape Tushar Khot and Hannaneh Hajishirzi. 2024a. Husky: A Unified Open-Source Language Agent for Multi-Step Reasoning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.06469 (2024)."},{"key":"e_1_3_3_3_27_1","unstructured":"Seungone Kim Juyoung Suk Shayne Longpre Bill\u00a0Yuchen Lin Jamin Shin Sean Welleck Graham Neubig Moontae Lee Kyungjae Lee and Minjoon Seo. 2024b. Prometheus 2: An open source language model specialized in evaluating other language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.01535 (2024)."},{"key":"e_1_3_3_3_28_1","unstructured":"Vijay Konda and John Tsitsiklis. 1999. Actor-critic algorithms. NeurIPS 12 (1999)."},{"key":"e_1_3_3_3_29_1","doi-asserted-by":"crossref","unstructured":"Mackenzie Leake Abe Davis Anh Truong and Maneesh Agrawala. 2017. Computational video editing for dialogue-driven scenes. ACM TOG (2017).","DOI":"10.1145\/3072959.3073653"},{"key":"e_1_3_3_3_30_1","doi-asserted-by":"crossref","unstructured":"Dawon Lee Jung\u00a0Eun Yoo Kyungmin Cho Bumki Kim Gyeonghun Im and Junyong Noh. 2022. PopStage: The Generation of Stage Cross-Editing Video based on Spatio-Temporal Matching. ACM TOG 41 6 (2022) 1\u201313.","DOI":"10.1145\/3550454.3555467"},{"key":"e_1_3_3_3_31_1","unstructured":"Seongyun Lee Seungone Kim Sue\u00a0Hyun Park Geewook Kim and Minjoon Seo. 2024. Prometheusvision: Vision-language model as a judge for fine-grained evaluation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.06591 (2024)."},{"key":"e_1_3_3_3_32_1","first-page":"366","volume-title":"ECCV","author":"Lin Zhiqiu","year":"2024","unstructured":"Zhiqiu Lin, Deepak Pathak, Baiqi Li, Jiayao Li, Xide Xia, Graham Neubig, Pengchuan Zhang, and Deva Ramanan. 2024. Evaluating text-to-visual generation with image-to-text generation. In ECCV. Springer, 366\u2013384."},{"key":"e_1_3_3_3_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"e_1_3_3_3_34_1","unstructured":"Aman Madaan Niket Tandon Prakhar Gupta Skyler Hallinan Luyu Gao Sarah Wiegreffe Uri Alon Nouha Dziri Shrimai Prabhumoye Yiming Yang et\u00a0al. 2024. Self-refine: Iterative refinement with self-feedback. NeurIPS 36 (2024)."},{"key":"e_1_3_3_3_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01257"},{"key":"e_1_3_3_3_36_1","volume-title":"In the Blink of an Eye : a Perspective on Film Editing","author":"Murch Walter","year":"2001","unstructured":"Walter Murch. 2001. In the Blink of an Eye : a Perspective on Film Editing. Silman-James Press."},{"key":"e_1_3_3_3_37_1","volume-title":"ICML","author":"Murty Shikhar","year":"2024","unstructured":"Shikhar Murty, Christopher\u00a0D. Manning, Peter Shaw, Mandar Joshi, and Kenton Lee. 2024. BAGEL: Bootstrapping Agents by Guiding Exploration with Language. In ICML."},{"key":"e_1_3_3_3_38_1","unstructured":"OpenAI. 2023. GPT-4V(ision) Technical Work and Authors. (2023). https:\/\/cdn.openai.com\/contributions\/gpt-4v.pdf"},{"key":"e_1_3_3_3_39_1","unstructured":"Alejandro Pardo Jui-Hsien Wang Bernard Ghanem Josef Sivic Bryan Russell and Fabian\u00a0Caba Heilbron. 2024. Generative Timelines for Instructed Visual Assembly. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.12293 (2024)."},{"key":"e_1_3_3_3_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-04881-4_48"},{"key":"e_1_3_3_3_41_1","first-page":"8748","volume-title":"ICML","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In ICML. PMLR, 8748\u20138763."},{"key":"e_1_3_3_3_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_2"},{"key":"e_1_3_3_3_43_1","doi-asserted-by":"crossref","unstructured":"Bernardino Romera-Paredes Mohammadamin Barekatain Alexander Novikov Matej Balog M\u00a0Pawan Kumar Emilien Dupont Francisco\u00a0JR Ruiz Jordan\u00a0S Ellenberg Pengming Wang Omar Fawzi et\u00a0al. 2024. Mathematical discoveries from program search with large language models. Nature (2024).","DOI":"10.1038\/s41586-023-06924-6"},{"key":"e_1_3_3_3_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01107"},{"key":"e_1_3_3_3_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19839-7_17"},{"key":"e_1_3_3_3_46_1","doi-asserted-by":"crossref","unstructured":"Jiatian Sun Longxiulin Deng Triantafyllos Afouras Andrew Owens and Abe Davis. 2023. Eventfulness for interactive video alignment. ACM TOG 42 4 (2023) 1\u201310.","DOI":"10.1145\/3592118"},{"key":"e_1_3_3_3_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"e_1_3_3_3_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640543.3645164"},{"key":"e_1_3_3_3_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640543.3645143"},{"key":"e_1_3_3_3_50_1","doi-asserted-by":"crossref","unstructured":"Oliver Wang Christopher Schroers Henning Zimmer Markus Gross and Alexander Sorkine-Hornung. 2014. Videosnapping: Interactive synchronization of multiple videos. ACM TOG 33 4 (2014) 1\u201310.","DOI":"10.1145\/2601097.2601208"},{"key":"e_1_3_3_3_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643834.3661591"},{"key":"e_1_3_3_3_52_1","doi-asserted-by":"crossref","unstructured":"Xiaohan Wang Yuhui Zhang Orr Zohar and Serena Yeung-Levy. 2024c. VideoAgent: Long-form Video Understanding with Large Language Model as Agent. ECCV (2024).","DOI":"10.1007\/978-3-031-72989-8_4"},{"key":"e_1_3_3_3_53_1","unstructured":"Brandon\u00a0T Willard and R\u00e9mi Louf. 2023. Efficient Guided Generation for LLMs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.09702 (2023)."},{"key":"e_1_3_3_3_54_1","volume-title":"NeurIPS","author":"Wu Shirley","year":"2024","unstructured":"Shirley Wu, Shiyu Zhao, Qian Huang, Kexin Huang, Michihiro Yasunaga, Kaidi Cao, Vassilis\u00a0N. Ioannidis, Karthik Subbian, Jure Leskovec, and James Zou. 2024b. AvaTaR: Optimizing LLM Agents for Tool Usage via Contrastive Reasoning. In NeurIPS. https:\/\/openreview.net\/forum?id=N4quRxE19p"},{"key":"e_1_3_3_3_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02098"},{"key":"e_1_3_3_3_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548268"},{"key":"e_1_3_3_3_57_1","unstructured":"John Yang Carlos\u00a0E Jimenez Alexander Wettig Kilian Lieret Shunyu Yao Karthik Narasimhan and Ofir Press. 2024. SWE-agent: Agent-computer interfaces enable automated software engineering. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.15793 (2024)."},{"key":"e_1_3_3_3_58_1","unstructured":"Shunyu Yao Jeffrey Zhao Dian Yu Nan Du Izhak Shafran Karthik Narasimhan and Yuan Cao. 2022. ReAct: Synergizing reasoning and acting in language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.03629 (2022)."},{"key":"e_1_3_3_3_59_1","unstructured":"Xinlu Zhang Yujie Lu Weizhi Wang An Yan Jun Yan Lianke Qin Heng Wang Xifeng Yan William\u00a0Yang Wang and Linda\u00a0Ruth Petzold. 2023. GPT-4V(ision) as a Generalist Evaluator for Vision-Language Tasks. arxiv:https:\/\/arXiv.org\/abs\/2311.01361\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2311.01361"},{"key":"e_1_3_3_3_60_1","unstructured":"Yuanhan Zhang Bo Li haotian Liu Yong\u00a0jae Lee Liangke Gui Di Fu Jiashi Feng Ziwei Liu and Chunyuan Li. 2024. LLaVA-NeXT: A Strong Zero-shot Video Understanding Model. https:\/\/llava-vl.github.io\/blog\/2024-04-30-llava-next-video\/"},{"key":"e_1_3_3_3_61_1","volume-title":"NeurIPS","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric\u00a0P. Xing, Hao Zhang, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2024. Judging LLM-as-a-judge with MT-bench and Chatbot Arena. In NeurIPS. Curran Associates Inc."}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730761","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:04:25Z","timestamp":1774019065000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730761"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":60,"alternative-id":["10.1145\/3721238.3730761","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730761","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}