{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T17:29:17Z","timestamp":1770917357060,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Nature Science Foundation of China","award":["No. 62376199"],"award-info":[{"award-number":["No. 62376199"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658112","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"608-617","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["SamCap: Energy-based Controllable Image Captioning by Gradient-Based Sampling"],"prefix":"10.1145","author":[{"given":"Yuchen","family":"Niu","sequence":"first","affiliation":[{"name":"Department of Computer Science and Technology, Tongji University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5007-9215","authenticated-orcid":false,"given":"Min","family":"Zhu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tongji University, Shanghai, China"}]},{"given":"Zhihua","family":"Wei","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tongji University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, Roman Ring, Eliza Rutherford, Serkan Cabi, Tengda Han, Zhitao Gong, Sina Samangooei, Marianne Monteiro, Jacob L. Menick, Sebastian Borgeaud, Andy Brock, Aida Nematzadeh, Sahand Sharifzadeh, Miko?aj Bi?kowski, Ricardo Barreira, Oriol Vinyals, Andrew Zisserman, and Kar\u00e9n Simonyan. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. Advances in Neural Information Processing Systems , Vol. 35 (Dec. 2022), 23716--23736. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/hash\/960a172bc7fbf0177ccccbb411a7d800-Abstract-Conference.html"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3--319--46454--1_24"},{"key":"e_1_3_2_1_3_1","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization Jade Goldstein Alon Lavie Chin-Yew Lin and Clare Voss (Eds.). Association for Computational Linguistics Ann Arbor Michigan 65--72. https:\/\/aclanthology.org\/W05-0909"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","unstructured":"Yoshua Bengio Nicholas L\u00e9onard and Aaron Courville. 2013. Estimating or Propagating Gradients Through Stochastic Neurons for Conditional Computation. https:\/\/doi.org\/10.48550\/arXiv.1308.3432 arXiv:1308.3432 [cs].","DOI":"10.48550\/arXiv.1308.3432"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01657"},{"key":"e_1_3_2_1_6_1","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Chen Qi","year":"2022","unstructured":"Qi Chen, Chaorui Deng, and Qi Wu. 2022. Learning Distinct and Representative Modes for Image Captioning. Advances in Neural Information Processing Systems , Vol. 35 (Dec. 2022), 9472--9485. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/hash\/3d77c6dcc7f143aa2154e7f4d5e22d68-Abstract-Conference.html"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00998"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3-030-01249--6_32"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00850"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103857"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3-030--58601-0_42"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01095"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"e_1_3_2_1_14_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_2_1_15_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Du Yilun","year":"2019","unstructured":"Yilun Du and Igor Mordatch. 2019. Implicit Generation and Modeling with Energy Based Models. In Advances in Neural Information Processing Systems, Vol. 32. Curran Associates, Inc. https:\/\/papers.nips.cc\/paper\/2019\/hash\/378a063b8fdb1db941e34f4bde584c7d-Abstract.html"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01748"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.108"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00433"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_21_1","unstructured":"Ari Holtzman Jan Buys Li Du Maxwell Forbes and Yejin Choi. 2019. The Curious Case of Neural Text Degeneration. https:\/\/openreview.net\/forum?id=rygGQyrFvH"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01745"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Andrej Karpathy and Li Fei-Fei. 2015. Deep Visual-Semantic Alignments for Generating Image Descriptions. 3128--3137. https:\/\/www.cv-foundation.org\/openaccess\/content_cvpr_2015\/html\/Karpathy_Deep_Visual-Semantic_Alignments_2015_CVPR_paper.html","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning. PMLR, 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision. In Proceedings of the 38th International Conference on Machine Learning. PMLR, 5583--5594. https:\/\/proceedings.mlr.press\/v139\/kim21k.html ISSN: 2640--3498."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.144"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Yann Lecun Sumit Chopra and Raia Hadsell. 2006. A tutorial on energy-based learning.","DOI":"10.7551\/mitpress\/7443.003.0014"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In Proceedings of the 40th International Conference on Machine Learning. PMLR, 19730--19742. https:\/\/proceedings.mlr.press\/v202\/li23q.html ISSN: 2640--3498."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 39th International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In Proceedings of the 39th International Conference on Machine Learning. PMLR, 12888--12900. https:\/\/proceedings.mlr.press\/v162\/li22n.html ISSN: 2640--3498."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3-030--58577--8_8"},{"key":"e_1_3_2_1_30_1","volume-title":"ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out. Association for Computational Linguistics, Barcelona, Spain, 74--81. https:\/\/aclanthology.org\/W04--1013"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3--319--10602--1_48"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","unstructured":"Annika Lindh Robert Ross and John Kelleher. 2020. Language-Driven Region Pointer Advancement for Controllable Image Captioning. In Proceedings of the 28th International Conference on Computational Linguistics Donia Scott Nuria Bel and Chengqing Zong (Eds.). International Committee on Computational Linguistics Barcelona Spain (Online) 1922--1935. https:\/\/doi.org\/10.18653\/v1\/2020.coling-main.174","DOI":"10.18653\/v1\/2020.coling-main.174"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.150"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","unstructured":"Xin Liu Muhammad Khalifa and Lu Wang. 2023. BOLT: Fast Energy-based Controlled Text Generation with Tunable Biases. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers) Anna Rogers Jordan Boyd-Graber and Naoaki Okazaki (Eds.). Association for Computational Linguistics Toronto Canada 186--200. https:\/\/doi.org\/10.18653\/v1\/2023.acl-short.18","DOI":"10.18653\/v1\/2023.acl-short.18"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3-030--58558--7_38"},{"key":"e_1_3_2_1_39_1","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Qin Lianhui","year":"2022","unstructured":"Lianhui Qin, Sean Welleck, Daniel Khashabi, and Yejin Choi. 2022. COLD Decoding: Energy-based Constrained Text Generation with Langevin Dynamics. Advances in Neural Information Processing Systems , Vol. 35 (Dec. 2022), 9538--9551. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/hash\/3e25d1aff47964c8409fd5c8dc0438d7-Abstract-Conference.html"},{"key":"e_1_3_2_1_40_1","volume-title":"Advances in Neural Information Processing Systems","volume":"28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In Advances in Neural Information Processing Systems, Vol. 28. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/2015\/hash\/14bfa6bb14875e45bba028a21ed38046-Abstract.html"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01280"},{"key":"e_1_3_2_1_42_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Song Yang","year":"2019","unstructured":"Yang Song and Stefano Ermon. 2019. Generative Modeling by Estimating Gradients of the Data Distribution. In Advances in Neural Information Processing Systems, Vol. 32. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/hash\/3001ef257407d5a371a96dcd947c7d93-Abstract.html"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"e_1_3_2_1_44_1","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, ?ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25360"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","unstructured":"Zhen Wang Jun Xiao Yueting Zhuang Fei Gao Jian Shao and Long Chen. 2023 a. Learning Combinatorial Prompts for Universal Controllable Image Captioning. https:\/\/doi.org\/10.48550\/arXiv.2303.06338 arXiv:2303.06338 [cs].","DOI":"10.48550\/arXiv.2303.06338"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the 28th International Conference on International Conference on Machine Learning (ICML'11)","author":"Welling Max","year":"2011","unstructured":"Max Welling and Yee Whye Teh. 2011. Bayesian learning via stochastic gradient langevin dynamics. In Proceedings of the 28th International Conference on International Conference on Machine Learning (ICML'11). Omnipress, Madison, WI, USA, 681--688."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.157"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.276"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2889922"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02247"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2976552"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6998"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00859"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658112","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658112","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:52:51Z","timestamp":1755766371000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658112"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":56,"alternative-id":["10.1145\/3652583.3658112","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658112","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}