{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:14Z","timestamp":1765339754373,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":15,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3762090","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:54:15Z","timestamp":1761375255000},"page":"14250-14256","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ENRIC: EveNt-AwaRe Captioning with Image Retrieval via UnCertainty-Guided Re-ranking and Semantic Ensemble Reasoning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-5872-5982","authenticated-orcid":false,"given":"Nam-Quan","family":"Nguyen","sequence":"first","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh City, Vietnam and Vietnam National University, Ho Chi Minh City, Vietnam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1501-8080","authenticated-orcid":false,"given":"Minh-Hoang","family":"Le","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh City, Vietnam and Vietnam National University, Ho Chi Minh City, Vietnam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4873-3258","authenticated-orcid":false,"given":"Vinh-Toan","family":"Vong","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh City, Vietnam and Vietnam National University, Ho Chi Minh City, Vietnam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3046-3041","authenticated-orcid":false,"given":"Minh-Triet","family":"Tran","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh City, Vietnam and Vietnam National University, Ho Chi Minh City, Vietnam"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"volume-title":"EVENTA Grand Challenge - ACM Multimedia","year":"2025","key":"e_1_3_2_1_1_1","unstructured":"2025. EVENTA Grand Challenge - ACM Multimedia 2025. (2025). https:\/\/ltnghia.github.io\/eventa\/"},{"key":"e_1_3_2_1_2_1","unstructured":"AI@Meta. 2024. Llama 3 Model Card. (2024). https:\/\/github.com\/meta-llama\/llama3\/blob\/main\/MODEL_CARD.md"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Ali Furkan Biten Lluis Gomez Mar\u00e7al Rusi\u00f1ol and Dimosthenis Karatzas. 2019. Good News Everyone! Context driven entity-aware captioning for news images. arXiv:1904.01475 [cs.CV] https:\/\/arxiv.org\/abs\/1904.01475","DOI":"10.1109\/CVPR.2019.01275"},{"key":"e_1_3_2_1_4_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et al. 2024. Expanding Performance Boundaries of Open-Source Multimodal Models with Model Data and Test-Time Scaling. arXiv preprint arXiv:2412.05271 (2024)."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185--24198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al. 2024. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185--24198."},{"key":"e_1_3_2_1_6_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2020.3043875"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Di Lu Spencer Whitehead Lifu Huang Heng Ji and Shih-Fu Chang. 2018. Entity-aware Image Caption Generation. arXiv:1804.07889 [cs.CL] https:\/\/arxiv.org\/abs\/1804.07889","DOI":"10.18653\/v1\/D18-1435"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3758264"},{"key":"e_1_3_2_1_10_1","unstructured":"Arnau Ramisa Fei Yan Francesc Moreno-Noguer and Krystian Mikolajczyk. 2016. BreakingNews: Article Annotation by Image and Text Processing. arXiv:1603.07141 [cs.CV] https:\/\/arxiv.org\/abs\/1603.07141"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Christopher Thomas and Adriana Kovashka. 2020. Preserving Semantic Neighborhoods for Robust Cross-modal Retrieval. arXiv:2007.08617 [cs.CV] https:\/\/arxiv.org\/abs\/2007.08617","DOI":"10.1007\/978-3-030-58523-5_19"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Alasdair Tran Alexander Mathews and Lexing Xie. 2020. Transform and Tell: Entity-Aware News Image Captioning. arXiv:2004.08070 [cs.CV] https:\/\/arxiv.org\/abs\/2004.08070","DOI":"10.1109\/CVPR42600.2020.01305"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_14_1","unstructured":"Haoran Wang Ying Zhang Zhong Ji Yanwei Pang and Lin Ma. 2021. Consensus-Aware Visual-Semantic Embedding for Image-Text Matching. arXiv:2007.08883 [cs.CV] https:\/\/arxiv.org\/abs\/2007.08883"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3412081"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3762090","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:04:28Z","timestamp":1765339468000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3762090"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":15,"alternative-id":["10.1145\/3746027.3762090","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3762090","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}