{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:41:22Z","timestamp":1755823282891,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612263","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"5705-5715","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Visual Captioning at Will: Describing Images and Videos Guided by a Few Stylized Sentences"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8924-5259","authenticated-orcid":false,"given":"Dingyi","family":"Yang","sequence":"first","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7682-1775","authenticated-orcid":false,"given":"Hongyu","family":"Chen","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5116-3454","authenticated-orcid":false,"given":"Xinglin","family":"Hou","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1381-2692","authenticated-orcid":false,"given":"Tiezheng","family":"Ge","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1665-3025","authenticated-orcid":false,"given":"Yuning","family":"Jiang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6486-6020","authenticated-orcid":false,"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"School of Information, Renmin University of China, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475173"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018151"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01249-6_32"},{"key":"e_1_3_2_1_5_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.108"},{"key":"e_1_3_2_1_7_1","volume-title":"I Can't Believe There's No Images! Learning Visual Tasks Using only Language Data. arXiv preprint arXiv:2211.09778","author":"Gu Sophia","year":"2022","unstructured":"Sophia Gu, Christopher Clark, and Aniruddha Kembhavi. 2022. I Can't Believe There's No Images! Learning Visual Tasks Using only Language Data. arXiv preprint arXiv:2211.09778 (2022)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00433"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.100"},{"key":"e_1_3_2_1_10_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 17980--17989","author":"Hu Xiaowei","year":"2022","unstructured":"Xiaowei Hu, Zhe Gan, Jianfeng Wang, Zhengyuan Yang, Zicheng Liu, Yumao Lu, and Lijuan Wang. 2022a. Scaling up vision-language pre-training for image captioning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 17980--17989."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544903.3544906"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475662"},{"key":"e_1_3_2_1_15_1","volume-title":"retrieve, generate: a simple approach to sentiment and style transfer. arXiv preprint arXiv:1804.06437","author":"Li Juncen","year":"2018","unstructured":"Juncen Li, Robin Jia, He He, and Percy Liang. 2018. Delete, retrieve, generate: a simple approach to sentiment and style transfer. arXiv preprint arXiv:1804.06437 (2018)."},{"key":"e_1_3_2_1_16_1","volume-title":"Taking an Emotional Look at Video Paragraph Captioning. arXiv preprint arXiv:2203.06356","author":"Li Qinyu","year":"2022","unstructured":"Qinyu Li, Tengpeng Li, Hanli Wang, and Chang Wen Chen. 2022. Taking an Emotional Look at Video Paragraph Captioning. arXiv preprint arXiv:2203.06356 (2022)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_18_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_19_1","volume-title":"Lili Mou, and Mauajama Firdaus.","author":"Luo Guoqing","year":"2023","unstructured":"Guoqing Luo, Yu Tong Han, Lili Mou, and Mauajama Firdaus. 2023. Prompt-Based Editing for Text Style Transfer. arXiv preprint arXiv:2301.11997 (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00896"},{"key":"e_1_3_2_1_23_1","volume-title":"Evaluating style transfer for text. arXiv preprint arXiv:1904.02295","author":"Mir Remi","year":"2019","unstructured":"Remi Mir, Bjarke Felbo, Nick Obradovich, and Iyad Rahwan. 2019. Evaluating style transfer for text. arXiv preprint arXiv:1904.02295 (2019)."},{"key":"e_1_3_2_1_24_1","volume-title":"Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734","author":"Mokady Ron","year":"2021","unstructured":"Ron Mokady, Amir Hertz, and Amit H Bermano. 2021. Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)."},{"key":"e_1_3_2_1_25_1","volume-title":"Text-Only Training for Image Captioning using Noise-Injected CLIP. arXiv preprint arXiv:2211.00575","author":"Nukrai David","year":"2022","unstructured":"David Nukrai, Ron Mokady, and Amir Globerson. 2022. Text-Only Training for Image Captioning using Noise-Injected CLIP. arXiv preprint arXiv:2211.00575 (2022)."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_1_29_1","volume-title":"A recipe for arbitrary text style transfer with large language models. arXiv preprint arXiv:2109.03910","author":"Reif Emily","year":"2021","unstructured":"Emily Reif, Daphne Ippolito, Ann Yuan, Andy Coenen, Chris Callison-Burch, and Jason Wei. 2021. A recipe for arbitrary text style transfer with large language models. arXiv preprint arXiv:2109.03910 (2021)."},{"key":"e_1_3_2_1_30_1","volume-title":"TextSETTR: Few-shot text style extraction and tunable targeted restyling. arXiv preprint arXiv:2010.03802","author":"Riley Parker","year":"2020","unstructured":"Parker Riley, Noah Constant, Mandy Guo, Girish Kumar, David Uthus, and Zarana Parekh. 2020. TextSETTR: Few-shot text style extraction and tunable targeted restyling. arXiv preprint arXiv:2010.03802 (2020)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01280"},{"key":"e_1_3_2_1_32_1","volume-title":"Sarah Kreps, et al.","author":"Solaiman Irene","year":"2019","unstructured":"Irene Solaiman, Miles Brundage, Jack Clark, Amanda Askell, Ariel Herbert-Voss, Jeff Wu, Alec Radford, Gretchen Krueger, Jong Wook Kim, Sarah Kreps, et al. 2019. Release strategies and the social impacts of language models. arXiv preprint arXiv:1908.09203 (2019)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2002-303"},{"key":"e_1_3_2_1_34_1","volume-title":"Prompt-and-rerank: A method for zero-shot and few-shot arbitrary textual style transfer with small language models. arXiv preprint arXiv:2205.11503","author":"Suzgun Mirac","year":"2022","unstructured":"Mirac Suzgun, Luke Melas-Kyriazi, and Dan Jurafsky. 2022. Prompt-and-rerank: A method for zero-shot and few-shot arbitrary textual style transfer with small language models. arXiv preprint arXiv:2205.11503 (2022)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548295"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3479207"},{"key":"e_1_3_2_1_37_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_39_1","volume-title":"Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100 (2022)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01752-7"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_42_1","volume-title":"International Conference on Machine Learning. PMLR, 10534--10543","author":"Xu Peng","year":"2020","unstructured":"Peng Xu, Jackie Chi Kit Cheung, and Yanshuai Cao. 2020. On variational learning of controllable representations for text without supervision. In International Conference on Machine Learning. PMLR, 10534--10543."},{"key":"e_1_3_2_1_43_1","volume-title":"Image captioning at will: A versatile scheme for effectively injecting sentiments into image descriptions. arXiv preprint arXiv:1801.10121","author":"You Quanzeng","year":"2018","unstructured":"Quanzeng You, Hailin Jin, and Jiebo Luo. 2018. Image captioning at will: A versatile scheme for effectively injecting sentiments into image descriptions. arXiv preprint arXiv:1801.10121 (2018)."},{"key":"e_1_3_2_1_44_1","volume-title":"ConZIC: Controllable Zero-shot Image Captioning by Sampling-Based Polishing. arXiv preprint arXiv:2303.02437","author":"Zeng Zequn","year":"2023","unstructured":"Zequn Zeng, Hao Zhang, Zhengjue Wang, Ruiying Lu, Dongsheng Wang, and Bo Chen. 2023. ConZIC: Controllable Zero-shot Image Captioning by Sampling-Based Polishing. arXiv preprint arXiv:2303.02437 (2023)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6998"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612263","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612263","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:03:44Z","timestamp":1755821024000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612263"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":45,"alternative-id":["10.1145\/3581783.3612263","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612263","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}