{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:30:32Z","timestamp":1772908232617,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the Supercomputing Center of the USTC"},{"name":"NSFC","award":["U20A20183 and 62021001"],"award-info":[{"award-number":["U20A20183 and 62021001"]}]},{"name":"MCC Lab of Information Science and Technology Institution, USTC"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612179","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"3686-3695","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Text-Only Training for Visual Storytelling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0965-584X","authenticated-orcid":false,"given":"Yuechen","family":"Wang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1690-9836","authenticated-orcid":false,"given":"Wengang","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China &amp; Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0918-7524","authenticated-orcid":false,"given":"Zhenbo","family":"Lu","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2188-3028","authenticated-orcid":false,"given":"Houqiang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China &amp; Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"North American Chapter of the Association for Computational Linguistics","author":"Kenneth' Huang Ting-Hao","year":"2016","unstructured":"Ting-Hao 'Kenneth' Huang, Francis Ferraro, N. Mostafazadeh, Ishan Misra, Aish-warya Agrawal, Jacob Devlin, Ross B. Girshick, Xiaodong He, Pushmeet Kohli, Dhruv Batra, C. Lawrence Zitnick, Devi Parikh, Lucy Vanderwende, Michel Galley, and Margaret Mitchell. Visual storytelling. In North American Chapter of the Association for Computational Linguistics, 2016."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P15-2017"},{"key":"e_1_3_2_1_3_1","first-page":"5356","volume-title":"International Joint Conference on Artificial Intelligence","author":"Yang Pengcheng","year":"2019","unstructured":"Pengcheng Yang, Fuli Luo, Peng Chen, Lei Li, Zhiyi Yin, Xiaodong He, and Xu Sun. Knowledgeable storyteller: A commonsense-driven generative model for visual storytelling. In International Joint Conference on Artificial Intelligence, pages 5356--5362, 2019."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6455"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01247"},{"key":"e_1_3_2_1_6_1","volume-title":"Improving language understanding by generative pre-training","author":"Radford Alec","year":"2018","unstructured":"Alec Radford and Karthik Narasimhan. Improving language understanding by generative pre-training. 2018."},{"key":"e_1_3_2_1_7_1","volume-title":"Language models are unsupervised multitask learners","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeff Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. Language models are unsupervised multitask learners. 2019."},{"key":"e_1_3_2_1_8_1","first-page":"1877","volume-title":"Advances in Neural Information Processing Systems","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. Language models are few-shot learners. In Advances in Neural Information Processing Systems, pages 1877--1901, 2020."},{"key":"e_1_3_2_1_9_1","first-page":"8748","volume-title":"International Conference on Machine Learning","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning, volume 139, pages 8748--8763, 2021."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.299"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.10760"},{"key":"e_1_3_2_1_13_1","volume-title":"show and tell: A neural visual storyteller. arXiv preprint, abs\/1806.00738","author":"Gonzalez-Rico Diana","year":"2018","unstructured":"Diana Gonzalez-Rico and Gibran Fuentes Pineda. Contextualize, show and tell: A neural visual storyteller. arXiv preprint, abs\/1806.00738, 2018."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1083"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1101"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018465"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6303"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3230934"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350918"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548161"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475236"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3487553.3524649"},{"key":"e_1_3_2_1_23_1","volume-title":"ACM Multimedia Asia","author":"Fan Ruichao","year":"2022","unstructured":"Ruichao Fan, Hanli Wang, Jinjing Gu, and Xianhui Liu. Visual storytelling with hierarchical bert semantic guidance. In ACM Multimedia Asia, 2022."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-main.1"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.37"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.351"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467418"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.471"},{"key":"e_1_3_2_1_29_1","volume-title":"Ctrl: A conditional transformer language model for controllable generation. arXiv preprint, abs\/1909.05858","author":"Keskar Nitish Shirish","year":"2019","unstructured":"Nitish Shirish Keskar, Bryan McCann, Lav R. Varshney, Caiming Xiong, and Richard Socher. Ctrl: A conditional transformer language model for controllable generation. arXiv preprint, abs\/1909.05858, 2019."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.698"},{"key":"e_1_3_2_1_31_1","volume-title":"International Conference on Learning Representations","author":"Chan Alvin","year":"2021","unstructured":"Alvin Chan, Yew-Soon Ong, Bill Pung, Aston Zhang, and Jie Fu. Cocon: A self-supervised approach for controlled text generation. In International Conference on Learning Representations, 2021."},{"key":"e_1_3_2_1_32_1","volume-title":"Plug and play language models: A simple approach to controlled text generation. arXiv preprint, abs\/1912.02164","author":"Dathathri Sumanth","year":"2019","unstructured":"Sumanth Dathathri, Andrea Madotto, Janice Lan, Jane Hung, Eric Frank, Piero Molino, Jason Yosinski, and Rosanne Liu. Plug and play language models: A simple approach to controlled text generation. arXiv preprint, abs\/1912.02164, 2019."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.424"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.522"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.276"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aiide.v17i1.18891"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.334"},{"key":"e_1_3_2_1_38_1","first-page":"6000","volume-title":"International Conference on Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In International Conference on Neural Information Processing Systems, page 6000--6010, 2017."},{"key":"e_1_3_2_1_39_1","first-page":"1","volume-title":"Journal of Machine Learning Research","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of Machine Learning Research, pages 1--67, 2020."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"e_1_3_2_1_41_1","first-page":"2121","volume-title":"International Conference on Neural Information Processing Systems","author":"Frome Andrea","year":"2013","unstructured":"Andrea Frome, Greg S. Corrado, Jonathon Shlens, Samy Bengio, Jeffrey Dean, Marc'Aurelio Ranzato, and Tomas Mikolov. DeViSE: A deep visual-semantic embedding model. In International Conference on Neural Information Processing Systems, page 2121--2129, 2013."},{"key":"e_1_3_2_1_42_1","first-page":"67","volume-title":"European Conference on Computer Vision","author":"Joulin Armand","year":"2016","unstructured":"Armand Joulin, Laurens van der Maaten, Allan Jabri, and Nicolas Vasilache. Learning visual features from large weakly supervised data. In Bastian Leibe, Jiri Matas, Nicu Sebe, and Max Welling, editors, European Conference on Computer Vision, pages 67--84, 2016"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.218"},{"key":"e_1_3_2_1_44_1","first-page":"4193","volume-title":"IEEE International Conference on Computer Vision","author":"Li Ang","year":"2017","unstructured":"Ang Li, Allan Jabri, Armand Joulin, and Laurens van der Maaten. Learning visual n-grams from web data. In IEEE International Conference on Computer Vision, pages 4193--4202, 2017."},{"key":"e_1_3_2_1_45_1","volume-title":"International Conference on Machine Learning","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven C. H. Hoi. BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning, 2022."},{"key":"e_1_3_2_1_46_1","volume-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint, abs\/2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint, abs\/2301.12597, 2023."},{"key":"e_1_3_2_1_47_1","first-page":"21548","volume":"35","author":"Su Yixuan","year":"2022","unstructured":"Yixuan Su, Tian Lan, Yan Wang, Dani Yogatama, Lingpeng Kong, and Nigel Collier. A contrastive framework for neural text generation. In S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh, editors, Advances in Neural Information Processing Systems, volume 35, pages 21548--21561, 2022.","journal-title":"editors, Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_48_1","first-page":"11213","volume-title":"AAAI Conference on Artificial Intelligence","author":"Jung Yunjae","year":"2020","unstructured":"Yunjae Jung, Dahun Kim, Sanghyun Woo, Kyungsu Kim, Sungjin Kim, and In-So Kweon. Hide-and-tell: Learning to bridge photo streams for visual storytelling. In AAAI Conference on Artificial Intelligence, pages 11213--11220, 2020."},{"key":"e_1_3_2_1_49_1","volume-title":"Language models can see: Plugging visual controls in text generation. arXiv preprint, abs\/2205.02655","author":"Su Yixuan","year":"2022","unstructured":"Yixuan Su, Tian Lan, Yahui Liu, Fangyu Liu, Dani Yogatama, Yan Wang, Lingpeng Kong, and Nigel Collier. Language models can see: Plugging visual controls in text generation. arXiv preprint, abs\/2205.02655, 2022."},{"key":"e_1_3_2_1_50_1","first-page":"65","volume-title":"the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pages 65--72, 2005."},{"key":"e_1_3_2_1_51_1","first-page":"311","volume-title":"Annual Meeting of the Association for Computational Linguistics","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. Bleu: a method for automatic evaluation of machine translation. In Annual Meeting of the Association for Computational Linguistics, pages 311--318, 2002."},{"key":"e_1_3_2_1_52_1","first-page":"74","volume-title":"Text Summarization Branches Out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. ROUGE: A package for automatic evaluation of summaries. In Text Summarization Branches Out, pages 74--81, 2004."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1082"},{"key":"e_1_3_2_1_55_1","volume-title":"The curious case of neural text degeneration","author":"Holtzman Ari","year":"2020","unstructured":"Ari Holtzman, Jan Buys, Maxwell Forbes, and Yejin Choi. The curious case of neural text degeneration. 2020."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1098"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1014"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612179","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612179","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:06:15Z","timestamp":1755821175000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612179"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":57,"alternative-id":["10.1145\/3581783.3612179","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612179","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}