{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:08:30Z","timestamp":1765357710980,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":17,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3684998","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"11255-11257","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["AssistEditor: Multi-Agent Collaboration for GUI Workflow Automation in Video Creation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8494-3492","authenticated-orcid":false,"given":"Difei","family":"Gao","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9741-5028","authenticated-orcid":false,"given":"Siyuan","family":"Hu","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5953-0442","authenticated-orcid":false,"given":"Zechen","family":"Bai","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2568-2346","authenticated-orcid":false,"given":"Qinghong","family":"Lin","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7681-2166","authenticated-orcid":false,"given":"Mike Zheng","family":"Shou","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Deng Xiang","year":"2024","unstructured":"Xiang Deng, Yu Gu, Boyuan Zheng, Shijie Chen, Sam Stevens, Boshi Wang, Huan Sun, and Yu Su. 2024. Mind2web: Towards a generalist agent for the web. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"Assistgui: Task-oriented desktop graphical user interface automation. arXiv preprint arXiv:2312.13108","author":"Gao Difei","year":"2023","unstructured":"Difei Gao, Lei Ji, Zechen Bai, Mingyu Ouyang, Peiran Li, Dongxing Mao, Qinchen Wu, Weichen Zhang, Peiyi Wang, Xiangwu Guo, et al. 2023. Assistgui: Task-oriented desktop graphical user interface automation. arXiv preprint arXiv:2312.13108 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"AI software engineer. https:\/\/www.cognition-labs.com\/introducing-devin Retrieved","author":"Labs Cognition","year":"2024","unstructured":"Cognition Labs. 2024. Devin, AI software engineer. https:\/\/www.cognition-labs.com\/introducing-devin Retrieved April 12, 2024 from"},{"key":"e_1_3_2_1_5_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems, Vol. 33 (2020), 9459--9474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_6_1","first-page":"51991","article-title":"Camel: Communicative agents for\" mind\" exploration of large language model society","volume":"36","author":"Li Guohao","year":"2023","unstructured":"Guohao Li, Hasan Hammoud, Hani Itani, Dmitrii Khizbullin, and Bernard Ghanem. 2023. Camel: Communicative agents for\" mind\" exploration of large language model society. Advances in Neural Information Processing Systems, Vol. 36 (2023), 51991--52008.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"e_1_3_2_1_8_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_9_1","unstructured":"Meta. 2024. Introducing Meta Llama 3: The most capable openly available LLM to date. https:\/\/ai.meta.com\/blog\/meta-llama-3\/ Accessed: 2024-04--18."},{"key":"e_1_3_2_1_10_1","volume-title":"Christoforos Nalmpantis, Ram Pasunuru, Roberta Raileanu, Baptiste Rozi\u00e8re, Timo Schick, Jane Dwivedi-Yu, Asli Celikyilmaz, et al.","author":"Mialon Gr\u00e9goire","year":"2023","unstructured":"Gr\u00e9goire Mialon, Roberto Dess`i, Maria Lomeli, Christoforos Nalmpantis, Ram Pasunuru, Roberta Raileanu, Baptiste Rozi\u00e8re, Timo Schick, Jane Dwivedi-Yu, Asli Celikyilmaz, et al. 2023. Augmented language models: a survey. arXiv preprint arXiv:2302.07842 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"Android in the wild: A large-scale dataset for android device control. arXiv preprint arXiv:2307.10088","author":"Rawles Christopher","year":"2023","unstructured":"Christopher Rawles, Alice Li, Daniel Rodriguez, Oriana Riva, and Timothy Lillicrap. 2023. Android in the wild: A large-scale dataset for android device control. arXiv preprint arXiv:2307.10088 (2023)."},{"key":"e_1_3_2_1_13_1","unstructured":"Toran Bruce Richards. 2023. Auto-GPT: An Autonomous GPT-4 Experiment."},{"key":"e_1_3_2_1_14_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Shaw Peter","year":"2024","unstructured":"Peter Shaw, Mandar Joshi, James Cohan, Jonathan Berant, Panupong Pasupat, Hexiang Hu, Urvashi Khandelwal, Kenton Lee, and Kristina N Toutanova. 2024. From pixels to ui actions: Learning to follow instructions via graphical user interfaces. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Junning Zhao, Qian Liu, Che Liu, et al.","author":"Xie Tianbao","year":"2023","unstructured":"Tianbao Xie, Fan Zhou, Zhoujun Cheng, Peng Shi, Luoxuan Weng, Yitao Liu, Toh Jing Hua, Junning Zhao, Qian Liu, Che Liu, et al. 2023. Openagents: An open platform for language agents in the wild. arXiv preprint arXiv:2310.10634 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Android in the Zoo: Chain-of-Action-Thought for GUI Agents. arXiv preprint arXiv:2403.02713","author":"Zhang Jiwen","year":"2024","unstructured":"Jiwen Zhang, Jihao Wu, Yihua Teng, Minghui Liao, Nuo Xu, Xiao Xiao, Zhongyu Wei, and Duyu Tang. 2024. Android in the Zoo: Chain-of-Action-Thought for GUI Agents. arXiv preprint arXiv:2403.02713 (2024)."},{"key":"e_1_3_2_1_17_1","volume-title":"AgentStudio: A Toolkit for Building General Virtual Agents. arXiv preprint arXiv:2403.17918","author":"Zheng Longtao","year":"2024","unstructured":"Longtao Zheng, Zhiyuan Huang, Zhenghai Xue, Xinrun Wang, Bo An, and Shuicheng Yan. 2024. AgentStudio: A Toolkit for Building General Virtual Agents. arXiv preprint arXiv:2403.17918 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"Webarena: A realistic web environment for building autonomous agents. arXiv preprint arXiv:2307.13854","author":"Zhou Shuyan","year":"2023","unstructured":"Shuyan Zhou, Frank F Xu, Hao Zhu, Xuhui Zhou, Robert Lo, Abishek Sridhar, Xianyi Cheng, Yonatan Bisk, Daniel Fried, Uri Alon, et al. 2023. Webarena: A realistic web environment for building autonomous agents. arXiv preprint arXiv:2307.13854 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3684998","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3684998","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:28Z","timestamp":1750295848000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3684998"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":17,"alternative-id":["10.1145\/3664647.3684998","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3684998","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}