{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:00:02Z","timestamp":1765339202908,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","funder":[{"name":"Vietnam National University - Ho Chi Minh City","award":["C2024-18-25"],"award-info":[{"award-number":["C2024-18-25"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3762038","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"14257-14263","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["EVENT-Retriever: Event-Aware Multimodal Image Retrieval for Realistic Captions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8831-8846","authenticated-orcid":false,"given":"Dinh-Khoi","family":"Vo","sequence":"first","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9351-3750","authenticated-orcid":false,"given":"Van-Loc","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3046-3041","authenticated-orcid":false,"given":"Minh-Triet","family":"Tran","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7363-2610","authenticated-orcid":false,"given":"Trung-Nghia","family":"Le","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023a. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023b. Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"arXiv preprint arXiv:2502.13923","author":"Bai Shuai","year":"2025","unstructured":"Shuai Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Sibo Song, Kai Dang, Peng Wang, Shijie Wang, Jun Tang, Humen Zhong, Yuanzhi Zhu, Mingkun Yang, Zhaohai Li, Jianqiang Wan, Pengfei Wang, Wei Ding, Zheren Fu, Yiheng Xu, Jiabo Ye, Xi Zhang, Tianbao Xie, Zesen Cheng, Hang Zhang, Zhibo Yang, Haiyang Xu, and Junyang Lin. 2025. Qwen2.5-VL Technical Report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_5_1","volume-title":"Bge m3-embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation. arXiv preprint arXiv:2402.03216","author":"Chen Jianlv","year":"2024","unstructured":"Jianlv Chen, Shitao Xiao, Peitian Zhang, Kun Luo, Defu Lian, and Zheng Liu. 2024b. Bge m3-embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation. arXiv preprint arXiv:2402.03216 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation. Findings of the Association for Computational Linguistics: ACL","author":"Chen Jianlyu","year":"2024","unstructured":"Jianlyu Chen, Shitao Xiao, Peitian Zhang, Kun Luo, Defu Lian, and Zheng Liu. 2024c. M3-Embedding: Multi-Linguality, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation. Findings of the Association for Computational Linguistics: ACL (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4231-5"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/1571941.1572114"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10791-023-09418-1"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/2.410145"},{"key":"e_1_3_2_1_12_1","volume-title":"International conference on machine learning. PMLR, 4904-4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904-4916."},{"key":"e_1_3_2_1_13_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01593"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.07.139"},{"key":"e_1_3_2_1_16_1","first-page":"740","volume-title":"Switzerland","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer vision-ECCV 2014: 13th European conference, zurich, Switzerland, September 6-12, 2014, proceedings, part v 13. Springer, 740-755."},{"key":"e_1_3_2_1_17_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li and Yong Jae Lee. 2023a. Improved Baselines with Visual Instruction Tuning."},{"key":"e_1_3_2_1_18_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li Bo Li Yuanhan Zhang Sheng Shen and Yong Jae Lee. 2024. LLaVA-NeXT: Improved reasoning OCR and world knowledge. https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"e_1_3_2_1_19_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023b. Visual Instruction Tuning."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548132"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00213"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3758264"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_24_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_25_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3762067"},{"key":"e_1_3_2_1_27_1","volume-title":"Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100 (2022)."},{"key":"e_1_3_2_1_28_1","volume-title":"Multilingual e5 text embeddings: A technical report. arXiv preprint arXiv:2402.05672","author":"Wang Liang","year":"2024","unstructured":"Liang Wang, Nan Yang, Xiaolong Huang, Linjun Yang, Rangan Majumder, and Furu Wei. 2024b. Multilingual e5 text embeddings: A technical report. arXiv preprint arXiv:2402.05672 (2024)."},{"key":"e_1_3_2_1_29_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024a. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_30_1","unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv et al. 2025. Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025)."},{"key":"e_1_3_2_1_31_1","volume-title":"Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800","author":"Yao Yuan","year":"2024","unstructured":"Yuan Yao, Tianyu Yu, Ao Zhang, Chongyi Wang, Junbo Cui, Hongji Zhu, Tianchi Cai, Haoyu Li, Weilin Zhao, Zhihui He, et al., 2024. Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800 (2024)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_1_33_1","volume-title":"GME: Improving Universal Multimodal Retrieval by Multimodal LLMs. arXiv preprint arXiv:2412.16855","author":"Zhang Xin","year":"2024","unstructured":"Xin Zhang, Yanzhao Zhang, Wen Xie, Mingxin Li, Ziqi Dai, Dingkun Long, Pengjun Xie, Meishan Zhang, Wenjie Li, and Min Zhang. 2024. GME: Improving Universal Multimodal Retrieval by Multimodal LLMs. arXiv preprint arXiv:2412.16855 (2024)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00866"},{"key":"e_1_3_2_1_35_1","volume-title":"Embedding: Advancing Text Embedding and Reranking Through Foundation Models. arXiv preprint arXiv:2506.05176","author":"Zhang Yanzhao","year":"2025","unstructured":"Yanzhao Zhang, Mingxin Li, Dingkun Long, Xin Zhang, Huan Lin, Baosong Yang, Pengjun Xie, An Yang, Dayiheng Liu, Junyang Lin, et al., 2025a. Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models. arXiv preprint arXiv:2506.05176 (2025)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3511928"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3762038","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T03:56:57Z","timestamp":1765339017000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3762038"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":36,"alternative-id":["10.1145\/3746027.3762038","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3762038","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}