{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:11:24Z","timestamp":1765357884388,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":18,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3762067","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:54:17Z","timestamp":1761375257000},"page":"14244-14249","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Event-Enriched Image Analysis Grand Challenge At ACM Multimedia 2025"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-0800-6884","authenticated-orcid":false,"given":"Thien-Phuc","family":"Tran","sequence":"first","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3520-4624","authenticated-orcid":false,"given":"Minh-Quang","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3046-3041","authenticated-orcid":false,"given":"Minh-Triet","family":"Tran","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0236-7992","authenticated-orcid":false,"given":"Tam V.","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Dayton, Dayton, Ohio, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2906-0360","authenticated-orcid":false,"given":"Trong-Le","family":"Do","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4304-2334","authenticated-orcid":false,"given":"Duy-Nam","family":"Ly","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8537-1331","authenticated-orcid":false,"given":"Viet-Tham","family":"Huynh","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8297-5666","authenticated-orcid":false,"given":"Khanh-Duy","family":"Le","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5460-0229","authenticated-orcid":false,"given":"Mai-Khiem","family":"Tran","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7363-2610","authenticated-orcid":false,"given":"Trung-Nghia","family":"Le","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue et al. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01275"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"e_1_3_2_1_4_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. CLIPScore: A Reference-free Evaluation Metric for Image Captioning. In EMNLP."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning","volume":"139","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. In Proceedings of the 38th International Conference on Machine Learning, Vol. 139."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.356"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the 39th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In Proceedings of the 39th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 162). PMLR, 12888-12900."},{"key":"e_1_3_2_1_8_1","volume-title":"Shafiq Joty, Caiming Xiong, and Steven C. H. Hoi.","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath R. Selvaraju, Akhilesh Deepak Gotmare, Shafiq Joty, Caiming Xiong, and Steven C. H. Hoi. 2021. Align Before Fuse: Vision and Language Representation Learning with Momentum Distillation. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01303"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3758264"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.162"},{"key":"e_1_3_2_1_13_1","first-page":"8748","volume-title":"Proceedings of the 38th International Conference on Machine Learning","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning, Vol. 139. PMLR, 8748-8763."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.eacl-main.266"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_16_1","volume-title":"GIT: A Generative Image-to-text Transformer for Vision and Language. arXiv:2205.14100","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. GIT: A Generative Image-to-text Transformer for Vision and Language. arXiv:2205.14100 (2022)."},{"key":"e_1_3_2_1_17_1","volume-title":"International conference on machine learning. PMLR","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhudinov, Rich Zemel, and Yoshua Bengio. 2015. Show, attend and tell: Neural image caption generation with visual attention. In International conference on machine learning. PMLR, 2048-2057."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00273"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3762067","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:45:34Z","timestamp":1765309534000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3762067"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":18,"alternative-id":["10.1145\/3746027.3762067","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3762067","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}