{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T11:25:44Z","timestamp":1764588344540,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61976207"],"award-info":[{"award-number":["61976207"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Natural Science Foundation of China","award":["61906187"],"award-info":[{"award-number":["61906187"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3548295","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:42:46Z","timestamp":1665416566000},"page":"4733-4741","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Detach and Attach: Stylized Image Captioning without Paired Stylized Dataset"],"prefix":"10.1145","author":[{"given":"Yutong","family":"Tan","sequence":"first","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zheng","family":"Lin","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Peng","family":"Fu","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingyu","family":"Zheng","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lanrui","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanan","family":"Cao","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weipinng","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Spice: Semantic propositional image caption evaluation. In ECCV. 382--398.","author":"Anderson Peter","year":"2016","unstructured":"Peter Anderson , Basura Fernando , Mark Johnson , and Stephen Gould . 2016 . Spice: Semantic propositional image caption evaluation. In ECCV. 382--398. Peter Anderson, Basura Fernando, Mark Johnson, and Stephen Gould. 2016. Spice: Semantic propositional image caption evaluation. In ECCV. 382--398."},{"key":"e_1_3_2_2_2_1","volume-title":"METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In IEEvaluation@ACL.","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie . 2005 . METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In IEEvaluation@ACL. Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In IEEvaluation@ACL."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018151"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"crossref","unstructured":"Tianlang Chen Zhongping Zhang Quanzeng You Chen Fang Zhaowen Wang Hailin Jin and Jiebo Luo. 2018. ?Factual\"or?Emotional\": Stylized Image Captioning with Adaptive Learning and Attention. In ECCV. 519--535. Tianlang Chen Zhongping Zhang Quanzeng You Chen Fang Zhaowen Wang Hailin Jin and Jiebo Luo. 2018. ?Factual\"or?Emotional\": Stylized Image Captioning with Adaptive Learning and Attention. In ECCV. 519--535.","DOI":"10.1007\/978-3-030-01249-6_32"},{"key":"e_1_3_2_2_5_1","volume-title":"Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen , Hao Fang , Tsung-Yi Lin , Ramakrishna Vedantam , Saurabh Gupta , Piotr Doll\u00e1r , and C Lawrence Zitnick . 2015. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 ( 2015 ). Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2015. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)."},{"key":"e_1_3_2_2_6_1","volume-title":"Stylenet: Generating attractive visual captions with styles. In CVPR. 3137--3146.","author":"Gan Chuang","year":"2017","unstructured":"Chuang Gan , Zhe Gan , Xiaodong He , Jianfeng Gao , and Li Deng . 2017 . Stylenet: Generating attractive visual captions with styles. In CVPR. 3137--3146. Chuang Gan, Zhe Gan, Xiaodong He, Jianfeng Gao, and Li Deng. 2017. Stylenet: Generating attractive visual captions with styles. In CVPR. 3137--3146."},{"key":"e_1_3_2_2_7_1","unstructured":"Longteng Guo Jing Liu Peng Yao Jiangwei Li and Hanqing Lu. 2019. MSCap: Multi-Style Image Captioning With Unpaired Stylized Text. In CVPR. 4204--4213. Longteng Guo Jing Liu Peng Yao Jiangwei Li and Hanqing Lu. 2019. MSCap: Multi-Style Image Captioning With Unpaired Stylized Text. In CVPR. 4204--4213."},{"key":"e_1_3_2_2_8_1","volume-title":"Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers). 690--696","author":"Heafield Kenneth","year":"2013","unstructured":"Kenneth Heafield , Ivan Pouzyrevsky , Jonathan H Clark , and Philipp Koehn . 2013 . Scalable modified Kneser-Ney language model estimation . In Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers). 690--696 . Kenneth Heafield, Ivan Pouzyrevsky, Jonathan H Clark, and Philipp Koehn. 2013. Scalable modified Kneser-Ney language model estimation. In Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers). 690--696."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"Vineet John Lili Mou Hareesh Bahuleyan and Olga Vechtomova. 2019. Disentangled Representation Learning for Non-Parallel Text Style Transfer. In ACL. 424--434. Vineet John Lili Mou Hareesh Bahuleyan and Olga Vechtomova. 2019. Disentangled Representation Learning for Non-Parallel Text Style Transfer. In ACL. 424--434.","DOI":"10.18653\/v1\/P19-1041"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2598339"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","unstructured":"Yoon Kim. 2014. Convolutional Neural Networks for Sentence Classification. In EMNLP. Yoon Kim. 2014. Convolutional Neural Networks for Sentence Classification. In EMNLP.","DOI":"10.3115\/v1\/D14-1181"},{"key":"e_1_3_2_2_12_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba . 2014 . Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014). Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_2_13_1","volume-title":"arXiv preprint arXiv:1506.06726","author":"Kiros Ryan","year":"2015","unstructured":"Ryan Kiros , Yukun Zhu , Ruslan Salakhutdinov , Richard S Zemel , Antonio Torralba , Raquel Urtasun , and Sanja Fidler . 2015. Skip-Thought Vectors . arXiv preprint arXiv:1506.06726 ( 2015 ). Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S Zemel, Antonio Torralba, Raquel Urtasun, and Sanja Fidler. 2015. Skip-Thought Vectors. arXiv preprint arXiv:1506.06726 (2015)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475662"},{"key":"e_1_3_2_2_15_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin . 2004 . Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81. Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81."},{"key":"e_1_3_2_2_16_1","volume-title":"Semstyle: Learning to generate stylised image captions using unaligned text. In CVPR. 8591--8600.","author":"Mathews Alexander","year":"2018","unstructured":"Alexander Mathews , Lexing Xie , and Xuming He . 2018 . Semstyle: Learning to generate stylised image captions using unaligned text. In CVPR. 8591--8600. Alexander Mathews, Lexing Xie, and Xuming He. 2018. Semstyle: Learning to generate stylised image captions using unaligned text. In CVPR. 8591--8600."},{"key":"e_1_3_2_2_17_1","volume-title":"Senticap: Generating image descriptions with sentiments. In AAAI.","author":"Mathews Alexander Patrick","year":"2016","unstructured":"Alexander Patrick Mathews , Lexing Xie , and Xuming He . 2016 . Senticap: Generating image descriptions with sentiments. In AAAI. Alexander Patrick Mathews, Lexing Xie, and Xuming He. 2016. Senticap: Generating image descriptions with sentiments. In AAAI."},{"key":"e_1_3_2_2_18_1","volume-title":"Senti-Attend: Image Captioning using Sentiment and Attention. arXiv preprint arXiv:1811.09789","author":"Nezami Omid Mohamad","year":"2018","unstructured":"Omid Mohamad Nezami , Mark Dras , Stephen Wan , and Cecile Paris . 2018. Senti-Attend: Image Captioning using Sentiment and Attention. arXiv preprint arXiv:1811.09789 ( 2018 ). Omid Mohamad Nezami, Mark Dras, Stephen Wan, and Cecile Paris. 2018. Senti-Attend: Image Captioning using Sentiment and Attention. arXiv preprint arXiv:1811.09789 (2018)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"crossref","unstructured":"Omid Mohamad Nezami Mark Dras Stephen Wan C\u00e9cile Paris and Len Hamey. 2019. Towards Generating Stylized Image Captions via Adversarial Training. In PRICAI. 270--284. Omid Mohamad Nezami Mark Dras Stephen Wan C\u00e9cile Paris and Len Hamey. 2019. Towards Generating Stylized Image Captions via Adversarial Training. In PRICAI. 270--284.","DOI":"10.1007\/978-3-030-29908-8_22"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"crossref","unstructured":"Kishore Papineni Salim Roukos Todd Ward and Wei-Jing Zhu. 2002. BLEU: a method for automatic evaluation of machine translation. In ACL. 311--318. Kishore Papineni Salim Roukos Todd Ward and Wei-Jing Zhu. 2002. BLEU: a method for automatic evaluation of machine translation. In ACL. 311--318.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_2_21_1","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. In Advances in neural information processing systems. 91--99. Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. In Advances in neural information processing systems. 91--99."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"crossref","unstructured":"Andreas Stolcke. 2002. SRILM - an extensible language modeling toolkit. In INTERSPEECH. Andreas Stolcke. 2002. SRILM - an extensible language modeling toolkit. In INTERSPEECH.","DOI":"10.21437\/ICSLP.2002-303"},{"key":"e_1_3_2_2_23_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In NIPS. Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In NIPS."},{"key":"e_1_3_2_2_24_1","volume-title":"CIDEr: Consensus-based image description evaluation. CVPR","author":"Vedantam Ramakrishna","year":"2014","unstructured":"Ramakrishna Vedantam , C. Lawrence Zitnick , and Devi Parikh . 2014. CIDEr: Consensus-based image description evaluation. CVPR ( 2014 ), 4566--4575. Ramakrishna Vedantam, C. Lawrence Zitnick, and Devi Parikh. 2014. CIDEr: Consensus-based image description evaluation. CVPR (2014), 4566--4575."},{"key":"e_1_3_2_2_25_1","volume-title":"Unpaired sentiment-to-sentiment translation: A cycled reinforcement learning approach. arXiv preprint arXiv:1805.05181","author":"Xu Jingjing","year":"2018","unstructured":"Jingjing Xu , Xu Sun , Qi Zeng , Xuancheng Ren , Xiaodong Zhang , Houfeng Wang , and Wenjie Li. 2018. Unpaired sentiment-to-sentiment translation: A cycled reinforcement learning approach. arXiv preprint arXiv:1805.05181 ( 2018 ). Jingjing Xu, Xu Sun, Qi Zeng, Xuancheng Ren, Xiaodong Zhang, Houfeng Wang, and Wenjie Li. 2018. Unpaired sentiment-to-sentiment translation: A cycled reinforcement learning approach. arXiv preprint arXiv:1805.05181 (2018)."},{"key":"e_1_3_2_2_26_1","volume-title":"Image captioning at will: A versatile scheme for effectively injecting sentiments into image descriptions. arXiv preprint arXiv:1801.10121","author":"You Quanzeng","year":"2018","unstructured":"Quanzeng You , Hailin Jin , and Jiebo Luo . 2018. Image captioning at will: A versatile scheme for effectively injecting sentiments into image descriptions. arXiv preprint arXiv:1801.10121 ( 2018 ). Quanzeng You, Hailin Jin, and Jiebo Luo. 2018. Image captioning at will: A versatile scheme for effectively injecting sentiments into image descriptions. arXiv preprint arXiv:1801.10121 (2018)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6998"},{"key":"e_1_3_2_2_28_1","volume-title":"Sketch Storytelling. In ICASSP 2022--2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 4748--4752","author":"Zhou Yucheng","year":"2022","unstructured":"Yucheng Zhou . 2022 . Sketch Storytelling. In ICASSP 2022--2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 4748--4752 . Yucheng Zhou. 2022. Sketch Storytelling. In ICASSP 2022--2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 4748--4752."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Yukun Zhu Ryan Kiros Richard S Zemel Ruslan Salakhutdinov Raquel Urtasun Antonio Torralba and Sanja Fidler. 2015. Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books. In ICCV. Yukun Zhu Ryan Kiros Richard S Zemel Ruslan Salakhutdinov Raquel Urtasun Antonio Torralba and Sanja Fidler. 2015. Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books. In ICCV.","DOI":"10.1109\/ICCV.2015.11"}],"event":{"name":"MM '22: The 30th ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Lisboa Portugal","acronym":"MM '22"},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548295","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3548295","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:43Z","timestamp":1750186843000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548295"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":29,"alternative-id":["10.1145\/3503161.3548295","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3548295","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}