{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:08:43Z","timestamp":1765357723196,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680724","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"1924-1933","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Edit As You Wish: Video Caption Editing with Multi-grained User Control"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9809-8864","authenticated-orcid":false,"given":"Linli","family":"Yao","sequence":"first","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4483-262X","authenticated-orcid":false,"given":"Yuanmeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9366-994X","authenticated-orcid":false,"given":"Ziheng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7065-4671","authenticated-orcid":false,"given":"Xinglin","family":"Hou","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1381-2692","authenticated-orcid":false,"given":"Tiezheng","family":"Ge","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1665-3025","authenticated-orcid":false,"given":"Yuning","family":"Jiang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8241-9320","authenticated-orcid":false,"given":"Xu","family":"Sun","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6486-6020","authenticated-orcid":false,"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01277"},{"key":"e_1_3_2_1_2_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Honeybee: Locality-enhanced Projector for Multimodal LLM. 
arXiv preprint arXiv:2312.06742","author":"Cha Junbum","year":"2023","unstructured":"Junbum Cha, Wooyoung Kang, Jonghwan Mun, and Byungseok Roh. 2023. Honeybee: Locality-enhanced Projector for Multimodal LLM. arXiv preprint arXiv:2312.06742 (2023)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2896515"},{"key":"e_1_3_2_1_5_1","unstructured":"Aakanksha Chowdhery Sharan Narang Jacob Devlin Maarten Bosma Gaurav Mishra Adam Roberts Paul Barham Hyung Won Chung Charles Sutton Sebastian Gehrmann Parker Schuh Kensen Shi Sasha Tsvyashchenko Joshua Maynez Abhishek Rao Parker Barnes Yi Tay Noam Shazeer Vinodkumar Prabhakaran Emily Reif Nan Du Ben Hutchinson Reiner Pope James Bradbury Jacob Austin Michael Isard Guy Gur-Ari Pengcheng Yin Toju Duke Anselm Levskaya Sanjay Ghemawat Sunipa Dev Henryk Michalewski Xavier Garcia Vedant Misra Kevin Robinson Liam Fedus Denny Zhou Daphne Ippolito David Luan Hyeontaek Lim Barret Zoph Alexander Spiridonov Ryan Sepassi David Dohan Shivani Agrawal Mark Omernick Andrew M. Dai Thanumalayan Sankaranarayana Pillai Marie Pellat Aitor Lewkowycz Erica Moreira Rewon Child Oleksandr Polozov Katherine Lee Zongwei Zhou Xuezhi Wang Brennan Saeta Mark Diaz Orhan Firat Michele Catasta Jason Wei Kathy Meier-Hellstern Douglas Eck Jeff Dean Slav Petrov and Noah Fiedel. 2022. PaLM: Scaling Language Modeling with Pathways. CoRR Vol. abs\/2204.02311 (2022)."},{"key":"e_1_3_2_1_6_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Albert Li, Pascale Fung, and Steven C. H. Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Albert Li, Pascale Fung, and Steven C. H. Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. ArXiv, Vol. abs\/2305.06500 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258615266"},{"key":"e_1_3_2_1_7_1","volume-title":"Diverse and controllable image captioning with part-of-speech guidance. CVPR","author":"Deshpande Aditya","year":"2019","unstructured":"Aditya Deshpande, Jyoti Aneja, Liwei Wang, Alexander Schwing, and David A Forsyth. 2019. Diverse and controllable image captioning with part-of-speech guidance. CVPR (2019)."},{"key":"e_1_3_2_1_8_1","volume-title":"A survey for in-context learning. arXiv preprint arXiv:2301.00234","author":"Dong Qingxiu","year":"2022","unstructured":"Qingxiu Dong, Lei Li, Damai Dai, Ce Zheng, Zhiyong Wu, Baobao Chang, Xu Sun, Jingjing Xu, and Zhifang Sui. 2022. A survey for in-context learning. arXiv preprint arXiv:2301.00234 (2022)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.250"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.2016299"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_12_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. arxiv: 2301.12597 [cs.CV]"},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. 
PMLR, 12888--12900."},{"key":"e_1_3_2_1_14_1","volume-title":"Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355","author":"Li KunChang","year":"2023","unstructured":"KunChang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023. Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer. arXiv preprint arXiv:2211.09552","author":"Li Kunchang","year":"2022","unstructured":"Kunchang Li, Yali Wang, Yinan He, Yizhuo Li, Yi Wang, Limin Wang, and Yu Qiao. 2022. UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer. arXiv preprint arXiv:2211.09552 (2022)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2984069"},{"key":"e_1_3_2_1_17_1","volume-title":"Video-LLaVA: Learning United Visual Representation by Alignment Before Projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Bin Zhu, Yang Ye, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-LLaVA: Learning United Visual Representation by Alignment Before Projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2973831"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240632"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_22_1","volume-title":"Visual instruction tuning. arXiv preprint arXiv:2304.08485","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Decoupled Weight Decay Regularization. In International Conference on Learning Representations.","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled Weight Decay Regularization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_24_1","volume-title":"UniVL: A Unified Video and Language Pre-Training Model for Multimodal Understanding and Generation. arXiv preprint arXiv:2002.06353","author":"Luo Huaishao","year":"2020","unstructured":"Huaishao Luo, Lei Ji, Botian Shi, Haoyang Huang, Nan Duan, Tianrui Li, Jason Li, Taroon Bharti, and Ming Zhou. 2020. UniVL: A Unified Video and Language Pre-Training Model for Multimodal Understanding and Generation. arXiv preprint arXiv:2002.06353 (2020)."},{"key":"e_1_3_2_1_25_1","volume-title":"Valley: Video Assistant with Large Language model Enhanced abilitY. arXiv preprint arXiv:2306.07207","author":"Luo Ruipu","year":"2023","unstructured":"Ruipu Luo, Ziwang Zhao, Min Yang, Junwei Dong, Minghui Qiu, Pengcheng Lu, Tao Wang, and Zhongyu Wei. 2023. Valley: Video Assistant with Large Language model Enhanced abilitY. arXiv preprint arXiv:2306.07207 (2023)."},{"key":"e_1_3_2_1_26_1","unstructured":"OpenAI. 2022. OpenAI: Introducing ChatGPT. 
https:\/\/openai.com\/blog\/chatgpt"},{"key":"e_1_3_2_1_28_1","volume-title":"Training language models to follow instructions with human feedback. CoRR","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul F. Christiano, Jan Leike, and Ryan Lowe. 2022. Training language models to follow instructions with human feedback. CoRR, Vol. abs\/2203.02155 (2022)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_31_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2984066"},{"key":"e_1_3_2_1_33_1","volume-title":"TimeChat: A Time-sensitive Multimodal Large Language Model for Long Video Understanding. ArXiv","author":"Ren Shuhuai","year":"2051","unstructured":"Shuhuai Ren, Linli Yao, Shicheng Li, Xu Sun, and Lu Hou. 2023. TimeChat: A Time-sensitive Multimodal Large Language Model for Long Video Understanding. ArXiv, Vol. abs\/2312.02051 (2023)."},{"key":"e_1_3_2_1_34_1","unstructured":"Fawaz Sammani and Mahmoud Elsayed. 2019. Look and Modify: Modification Networks for Image Captioning."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00486"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2984062"},{"key":"e_1_3_2_1_37_1","volume-title":"Simple bert models for relation extraction and semantic role labeling. arXiv preprint arXiv:1904.05255","author":"Shi Peng","year":"2019","unstructured":"Peng Shi and Jimmy Lin. 2019. Simple bert models for relation extraction and semantic role labeling. arXiv preprint arXiv:1904.05255 (2019)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351060"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01740"},{"key":"e_1_3_2_1_40_1","volume-title":"From show to tell: a survey on deep learning-based image captioning","author":"Stefanini Matteo","year":"2022","unstructured":"Matteo Stefanini, Marcella Cornia, Lorenzo Baraldi, Silvia Cascianelli, Giuseppe Fiameni, and Rita Cucchiara. 2022. From show to tell: a survey on deep learning-based image captioning. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 1 (2022), 539--559."},{"key":"e_1_3_2_1_41_1","volume-title":"Eva-clip: Improved training techniques for clip at scale. 
arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023. Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3479207"},{"key":"e_1_3_2_1_43_1","volume-title":"LLaMA: Open and Efficient Foundation Language Models. CoRR","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aur\u00e9lien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. CoRR (2023)."},{"key":"e_1_3_2_1_44_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_47_1","volume-title":"Controllable Video Captioning with POS Sequence Guidance Based on Gated Fusion Network. arXiv preprint arXiv:1908.10072","author":"Wang Bairui","year":"2019","unstructured":"Bairui Wang, Lin Ma, Wei Zhang, Wenhao Jiang, Jingwen Wang, and Wei Liu. 2019. Controllable Video Captioning with POS Sequence Guidance Based on Gated Fusion Network. arXiv preprint arXiv:1908.10072 (2019)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings, Part XXXVI. 113--129","author":"Wang Zhen","year":"2022","unstructured":"Zhen Wang, Long Chen, Wenbo Ma, Guangxing Han, Yulei Niu, Jian Shao, and Jun Xiao. 2022. Explicit Image Caption Editing. In Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXXVI. 113--129."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_51_1","volume-title":"International conference on machine learning. PMLR","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhudinov, Rich Zemel, and Yoshua Bengio. 2015. Show, attend and tell: Neural image caption generation with visual attention. In International conference on machine learning. PMLR, 2048--2057."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00107"},{"key":"e_1_3_2_1_53_1","volume-title":"Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese. arXiv preprint arXiv:2211.01335","author":"Yang An","year":"2022","unstructured":"An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, and Chang Zhou. 2022. Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese. arXiv preprint arXiv:2211.01335 (2022)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612263"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Linli Yao Weijing Chen and Qin Jin. 2023. CapEnrich: Enriching Caption Semantics for Web Images via Cross-modal Pre-trained Knowledge. 
In TheWebConf.","DOI":"10.1145\/3543507.3583232"},{"key":"e_1_3_2_1_56_1","volume-title":"DeCo: Decoupling Token Compression from Semantic Abstraction in Multimodal Large Language Models. arXiv preprint arXiv:2405.20985","author":"Yao Linli","year":"2024","unstructured":"Linli Yao, Lei Li, Shuhuai Ren, Lean Wang, Yuanxin Liu, Xu Sun, and Lu Hou. 2024. DeCo: Decoupling Token Compression from Semantic Abstraction in Multimodal Large Language Models. arXiv preprint arXiv:2405.20985 (2024)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20218"},{"key":"e_1_3_2_1_58_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.347"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3532627"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413908"},{"key":"e_1_3_2_1_62_1","volume-title":"Zixuan Ma, Yufei Xue, Jidong Zhai, Wenguang Chen, Peng Zhang, Yuxiao Dong, and Jie Tang.","author":"Zeng Aohan","year":"2022","unstructured":"Aohan Zeng, Xiao Liu, Zhengxiao Du, Zihan Wang, Hanyu Lai, Ming Ding, Zhuoyi Yang, Yifan Xu, Wendi Zheng, Xiao Xia, Weng Lam Tam, Zixuan Ma, Yufei Xue, Jidong Zhai, Wenguang Chen, Peng Zhang, Yuxiao Dong, and Jie Tang. 2022. GLM-130B: An Open Bilingual Pre-trained Model. , Vol. abs\/2210.02414 (2022)."},{"key":"e_1_3_2_1_63_1","volume-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)."},{"key":"e_1_3_2_1_64_1","volume-title":"Attract me to Buy: Advertisement Copywriting Generation with Multimodal Multi-structured Information. arXiv preprint arXiv:2205.03534","author":"Zhang Zhipeng","year":"2022","unstructured":"Zhipeng Zhang, Xinglin Hou, Kai Niu, Zhongzhen Huang, Tiezheng Ge, Yuning Jiang, Qi Wu, and Peng Wang. 2022. Attract me to Buy: Advertisement Copywriting Generation with Multimodal Multi-structured Information. arXiv preprint arXiv:2205.03534 (2022). 
n"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680724","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680724","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:06:24Z","timestamp":1750291584000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680724"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":63,"alternative-id":["10.1145\/3664647.3680724","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680724","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}