{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T16:13:05Z","timestamp":1781194385101,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":30,"publisher":"ACM","funder":[{"name":"Key-Area Research and Development Program of Guangdong Province","award":["2019B111101001"],"award-info":[{"award-number":["2019B111101001"]}]},{"name":"Science and Technology on Information System Engineering Laboratory","award":["WDZC2020525-0410"],"award-info":[{"award-number":["WDZC2020525-0410"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3743093.3770956","type":"proceedings-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:06:16Z","timestamp":1765008376000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["OPCap:Object-aware Prompting Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-5328-0570","authenticated-orcid":false,"given":"Feiyang","family":"Huang","sequence":"first","affiliation":[{"name":"South China Normal University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4758-6091","authenticated-orcid":false,"given":"Yang","family":"Cao","sequence":"additional","affiliation":[{"name":"South China Normal University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6134-4313","authenticated-orcid":false,"given":"Jingyue","family":"Zhong","sequence":"additional","affiliation":[{"name":"South China Normal University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00904"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_3_1_6_2","first-page":"65","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65\u201372."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Ali\u00a0Furkan Biten Llu\u00eds\u00a0G\u00f3mez i Bigorda and Dimosthenis Karatzas. 2021. Let there be a clock on the beach: Reducing Object Hallucination in Image Captioning. 2022 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV) (2021) 2473\u20132482. https:\/\/api.semanticscholar.org\/CorpusID:238354129","DOI":"10.1109\/WACV51458.2022.00253"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","unstructured":"Shan Cao Gaoyun An Zhenxing Zheng and Zhiyong Wang. 2022. Vision-Enhanced and Consensus-Aware Transformer for Image Captioning. IEEE Transactions on Circuits and Systems for Video Technology 32 (2022) 7005\u20137018. 10.1109\/TCSVT.2022.3178844","DOI":"10.1109\/TCSVT.2022.3178844"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671470"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00646"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long Short-Term Memory. Neural Comput. 9 8 (Nov. 1997) 1735\u20131780. 10.1162\/neco.1997.9.8.1735","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00473"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00473"},{"key":"e_1_3_3_1_15_2","first-page":"12888","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888\u201312900."},{"key":"e_1_3_3_1_16_2","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie Lubomir Bourdev Ross Girshick James Hays Pietro Perona Deva Ramanan C.\u00a0Lawrence Zitnick and Piotr Doll\u00e1r. 2015. Microsoft COCO: Common Objects in Context. arxiv:https:\/\/arXiv.org\/abs\/1405.0312\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1405.0312"},{"key":"e_1_3_3_1_17_2","unstructured":"Ron Mokady Amir Hertz and Amit\u00a0H. Bermano. 2021. ClipCap: CLIP Prefix for Image Captioning. arxiv:https:\/\/arXiv.org\/abs\/2111.09734\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2111.09734"},{"key":"e_1_3_3_1_18_2","first-page":"311","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00856"},{"key":"e_1_3_3_1_20_2","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark Gretchen Krueger and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arxiv:https:\/\/arXiv.org\/abs\/2103.00020\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Anna Rohrbach Lisa\u00a0Anne Hendricks Kaylee Burns Trevor Darrell and Kate Saenko. 2018. Object hallucination in image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1809.02156 (2018).","DOI":"10.18653\/v1\/D18-1437"},{"key":"e_1_3_3_1_22_2","unstructured":"Anna Rohrbach Lisa\u00a0Anne Hendricks Kaylee Burns Trevor Darrell and Kate Saenko. 2019. Object Hallucination in Image Captioning. arxiv:https:\/\/arXiv.org\/abs\/1809.02156\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1809.02156"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","unstructured":"Xiangqing Shen Bing Liu Yong Zhou and Jiaqi Zhao. 2020. Remote sensing image caption generation via transformer and reinforcement learning. Multimedia Tools and Applications 79 (2020) 26661 \u2013 26682. 10.1007\/s11042-020-09294-7","DOI":"10.1007\/s11042-020-09294-7"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_3_1_27_2","unstructured":"Jianfeng Wang Zhengyuan Yang Xiaowei Hu Linjie Li Kevin Lin Zhe Gan Zicheng Liu Ce Liu and Lijuan Wang. 2022. GIT: A Generative Image-to-text Transformer for Vision and Language. arxiv:https:\/\/arXiv.org\/abs\/2205.14100\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2205.14100"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-53302-0_3"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","unstructured":"Tiantao Xian Zhixin Li Zhenjun Tang and Huifang Ma. 2022. Adaptive Path Selection for Dynamic Image Captioning. IEEE Transactions on Circuits and Systems for Video Technology 32 (2022) 5762\u20135775. 10.1109\/TCSVT.2022.3155795","DOI":"10.1109\/TCSVT.2022.3155795"},{"key":"e_1_3_3_1_30_2","series-title":"Proceedings of Machine Learning Research","first-page":"2048","volume-title":"Proceedings of the 32nd International Conference on Machine Learning","volume":"37","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhudinov, Rich Zemel, and Yoshua Bengio. 2015. Show, Attend and Tell: Neural Image Caption Generation with Visual Attention. In Proceedings of the 32nd International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a037), Francis Bach and David Blei (Eds.). PMLR, Lille, France, 2048\u20132057. https:\/\/proceedings.mlr.press\/v37\/xuc15.html"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Jingyue Zhong Yang Cao Yina Zhu Jie Gong and Qiaosen Chen. 2023. Multi-channel weighted fusion for image captioning. The Visual Computer 39 12 (2023) 6115\u20136132.","DOI":"10.1007\/s00371-022-02716-7"}],"event":{"name":"MMAsia '25: ACM Multimedia Asia","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3743093.3770956","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:07:21Z","timestamp":1765008441000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3743093.3770956"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":30,"alternative-id":["10.1145\/3743093.3770956","10.1145\/3743093"],"URL":"https:\/\/doi.org\/10.1145\/3743093.3770956","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}