{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:00:13Z","timestamp":1765339213067,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3762039","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"14264-14270","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ReCap: Event-Aware Image Captioning with Article Retrieval and Semantic Gaussian Normalization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-5150-9307","authenticated-orcid":false,"given":"Thinh-Phuc","family":"Nguyen","sequence":"first","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7953-8942","authenticated-orcid":false,"given":"Thanh-Hai","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8746-3004","authenticated-orcid":false,"given":"Gia-Huy","family":"Dinh","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2890-5741","authenticated-orcid":false,"given":"Lam-Huy","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3046-3041","authenticated-orcid":false,"given":"Minh-Triet","family":"Tran","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7363-2610","authenticated-orcid":false,"given":"Trung-Nghia","family":"Le","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00583"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01275"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611987"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2023.3295776"},{"key":"e_1_3_2_1_5_1","volume-title":"Image captioning: Transforming objects into words. Advances in neural information processing systems","author":"Herdade Simao","year":"2019","unstructured":"Simao Herdade, Armin Kappeler, Kofi Boakye, and Joao Soares. 2019. Image captioning: Transforming objects into words. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295748"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413576"},{"key":"e_1_3_2_1_8_1","unstructured":"Ronghang Hu Amanpreet Singh Trevor Darrell and Marcus Rohrbach. 2020b. Iterative Answer Prediction with Pointer-Augmented Multimodal Transformers for TextVQA. In CVPR."},{"key":"e_1_3_2_1_9_1","volume-title":"Thomas Prola, Isabel De La Torre Diez, Md Abdus Samad, Imran Ashraf, et al.","author":"Jamil Azhar","year":"2024","unstructured":"Azhar Jamil, Khalid Mahmood, Monica Gracia Villar, Thomas Prola, Isabel De La Torre Diez, Md Abdus Samad, Imran Ashraf, et al., 2024. Deep learning approaches for image captioning: Opportunities, challenges and future potential. IEEE Access (2024)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02319"},{"key":"e_1_3_2_1_11_1","volume-title":"Intelligent Computation in Big Data Era: International Conference of Young Computer Scientists","author":"Jiang Bo","year":"2015","unstructured":"Ting-ting Li, Bo Jiang, Zheng-zheng Tu, Bin Luo, and Jin Tang. 2015. Image matching using mutual k-nearest neighbor graph. In Intelligent Computation in Big Data Era: International Conference of Young Computer Scientists, Engineers and Educators, ICYCSEE 2015, Harbin, China, January 10-12, 2015. Proceedings. Springer, 276-283."},{"key":"e_1_3_2_1_12_1","volume-title":"Visual news: Benchmark and challenges in news image captioning. arXiv preprint arXiv:2010.03743","author":"Liu Fuxiao","year":"2020","unstructured":"Fuxiao Liu, Yinghan Wang, Tianlu Wang, and Vicente Ordonez. 2020. Visual news: Benchmark and challenges in news image captioning. arXiv preprint arXiv:2010.03743 (2020)."},{"key":"e_1_3_2_1_13_1","volume-title":"Image Captioning in news report scenario. arXiv preprint arXiv:2403.16209","author":"Liu Tianrui","year":"2024","unstructured":"Tianrui Liu, Qi Cai, Changxin Xu, Bo Hong, Jize Xiong, Yuxin Qiao, and Tsungwei Yang. 2024. Image Captioning in news report scenario. arXiv preprint arXiv:2403.16209 (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3758264"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-main.280"},{"key":"e_1_3_2_1_16_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Theo Moutakanni Huy V. Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby Russell Howes Po-Yao Huang Hu Xu Vasu Sharma Shang-Wen Li Wojciech Galuba Mike Rabbat Mido Assran Nicolas Ballas Gabriel Synnaeve Ishan Misra Herve Jegou Julien Mairal Patrick Labatut Armand Joulin and Piotr Bojanowski. 2023. DINOv2: Learning Robust Visual Features without Supervision."},{"key":"e_1_3_2_1_17_1","volume-title":"Visualcomet: Reasoning about the dynamic context of a still image. In Computer Vision-ECCV 2020: 16th European Conference","author":"Park Jae Sung","year":"2020","unstructured":"Jae Sung Park, Chandra Bhagavatula, Roozbeh Mottaghi, Ali Farhadi, and Yejin Choi. 2020. Visualcomet: Reasoning about the dynamic context of a still image. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part V 16. Springer, 508-524."},{"key":"e_1_3_2_1_18_1","volume-title":"Visually-aware context modeling for news image captioning. arXiv preprint arXiv:2308.08325","author":"Qu Tingyu","year":"2023","unstructured":"Tingyu Qu, Tinne Tuytelaars, and Marie-Francine Moens. 2023. Visually-aware context modeling for news image captioning. arXiv preprint arXiv:2308.08325 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01013"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2011.02.008"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Oleg Sidorov Ronghang Hu Amanpreet Singh Marcus Rohrbach and Trevor Darrell. 2020. TextCaps: A Dataset for Image Captioning with Reading Comprehension. In ECCV.","DOI":"10.1007\/978-3-030-58536-5_44"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01189"},{"key":"e_1_3_2_1_24_1","unstructured":"Qwen Team. 2025. Qwen2.5-VL. https:\/\/qwenlm.github.io\/blog\/qwen2.5-vl\/"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01305"},{"key":"e_1_3_2_1_26_1","volume-title":"NYTimes800K: A Dataset for Image Captioning with Paragraph-Level Summaries. arXiv preprint arXiv:2201.12321","author":"Tran Duy-Kien","year":"2022","unstructured":"Duy-Kien Tran, Tien Bui, Khoa Tran, and Minh-Tien Nguyen. 2022. NYTimes800K: A Dataset for Image Captioning with Paragraph-Level Summaries. arXiv preprint arXiv:2201.12321 (2022)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3762067"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_29_1","volume-title":"Show and tell: Lessons learned from the 2015 mscoco image captioning challenge","author":"Vinyals Oriol","year":"2016","unstructured":"Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. 2016. Show and tell: Lessons learned from the 2015 mscoco image captioning challenge. IEEE transactions on pattern analysis and machine intelligence, Vol. 39, 4 (2016), 652-663."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3426655"},{"key":"e_1_3_2_1_31_1","unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv et al. 2025. Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.127823"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01337"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547883"},{"key":"e_1_3_2_1_36_1","volume-title":"Understanding image retrieval re-ranking: A graph neural network perspective. arXiv preprint arXiv:2012.07620","author":"Zhang Xuanmeng","year":"2020","unstructured":"Xuanmeng Zhang, Minyue Jiang, Zhedong Zheng, Xiao Tan, Errui Ding, and Yi Yang. 2020. Understanding image retrieval re-ranking: A graph neural network perspective. arXiv preprint arXiv:2012.07620 (2020)."},{"key":"e_1_3_2_1_37_1","volume-title":"Informative image captioning with external sources of information. arXiv preprint arXiv:1906.08876","author":"Zhao Sanqiang","year":"2019","unstructured":"Sanqiang Zhao, Piyush Sharma, Tomer Levinboim, and Radu Soricut. 2019. Informative image captioning with external sources of information. arXiv preprint arXiv:1906.08876 (2019)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3301279"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3762039","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T03:57:21Z","timestamp":1765339041000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3762039"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":38,"alternative-id":["10.1145\/3746027.3762039","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3762039","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}