{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:48:30Z","timestamp":1778082510845,"version":"3.51.4"},"reference-count":58,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100019635","name":"IITP","doi-asserted-by":"publisher","award":["RS-202400443251,RS-2024-00457882"],"award-info":[{"award-number":["RS-202400443251,RS-2024-00457882"]}],"id":[{"id":"10.13039\/100019635","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100007431","name":"NRF","doi-asserted-by":"publisher","award":["2023R1A2C2005373"],"award-info":[{"award-number":["2023R1A2C2005373"]}],"id":[{"id":"10.13039\/100007431","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.02067","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"22263-22273","source":"Crossref","is-referenced-by-count":1,"title":["Bidirectional Likelihood Estimation with Multi-Modal Large Language Models for Text-Video Retrieval"],"prefix":"10.1109","author":[{"given":"Dohwan","family":"Ko","sequence":"first","affiliation":[{"name":"Korea University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ji Soo","family":"Lee","sequence":"additional","affiliation":[{"name":"Korea University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minhyuk","family":"Choi","sequence":"additional","affiliation":[{"name":"Korea University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zihang","family":"Meng","sequence":"additional","affiliation":[{"name":"Meta GenAI"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hyunwoo J.","family":"Kim","sequence":"additional","affiliation":[{"name":"KAIST"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"ref7","article-title":"Clip-vip: Adapting pretrained image-text model to video-language representation alignment","author":"Xue","year":"2023","journal-title":"ICLR"},{"key":"ref8","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019","journal-title":"NAACL"},{"key":"ref9","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021","journal-title":"ICML"},{"key":"ref10","article-title":"Videochat-flash: Hierarchical compression for long-context video modeling","author":"Li","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref11","article-title":"Qwen Team","year":"2025","journal-title":"Qwen2.5-vl"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"ref13","article-title":"Llava-onevision: Easy visual task transfer","author":"Li","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref14","article-title":"Llamo: Large language model-based molecular graph assistant","author":"Park","year":"2024","journal-title":"NeurIPS"},{"key":"ref15","article-title":"Internvideo2. 5: Empowering video mllms with long and rich context modeling","author":"Wang","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i4.32474"},{"key":"ref17","article-title":"Internvl3: Exploring advanced training and test-time recipes for open-source multimodal models","author":"Zhu","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref18","article-title":"St-vlm: Kinematic instruction tuning for spatio-temporal reasoning in vision-language models","author":"Ko","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref19","article-title":"Mm-embed: Universal multimodal retrieval with multimodal 11 ms","author":"Lin","year":"2025","journal-title":"ICLR"},{"key":"ref20","article-title":"Mllm is a strong reranker: Advancing multimodal retrieval-augmented generation via knowledge-enhanced reranking and noise-injected training","author":"Chen","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00380"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.879"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01251"},{"key":"ref24","article-title":"Overcoming language priors in visual question answering with adversarial regularization","author":"Ramakrishnan","year":"2018","journal-title":"NeurIPS"},{"key":"ref25","article-title":"Rubi: Reducing unimodal biases for visual question answering","author":"Cadene","year":"2019","journal-title":"NeurIPS"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01316"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0987-1"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01826"},{"key":"ref29","article-title":"Qwen2. 5 technical report","author":"Yang","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref30","article-title":"Lora: Low-rank adaptation of large language models","author":"Hu","year":"2022","journal-title":"ICLR"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.261"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/7503.003.0029"},{"key":"ref33","article-title":"Passage re-ranking with bert","author":"Nogueira","year":"2019","journal-title":"arXiv preprint arXiv"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401075"},{"key":"ref36","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023","journal-title":"ICML"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73013-9_23"},{"key":"ref38","article-title":"Internvid: A large-scale video-text dataset for multimodal understanding and generation","author":"Wang","year":"2024","journal-title":"ICLR"},{"key":"ref39","article-title":"Internvideo: General video foundation models via generative and discriminative learning","author":"Wang","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref40","article-title":"Videococa: Videotext modeling with zero-shot transfer from contrastive captioners","author":"Yan","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref41","article-title":"Videoprism: A foundational visual encoder for video understanding","author":"Zhao","year":"2024","journal-title":"ICML"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02563"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01838"},{"key":"ref45","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","author":"Li","year":"2021","journal-title":"NeurIPS"},{"key":"ref46","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022","journal-title":"ICML"},{"key":"ref47","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"ref49","article-title":"Internlm-xcomposer-2.5: A versatile large vision language model supporting long-contextual input and output","author":"Zhang","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4231-5"},{"key":"ref51","article-title":"Mme: A comprehensive evaluation benchmark for multimodal large language models","author":"Fu","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref53","article-title":"Seed-bench: Benchmarking multimodal 11 ms with generative comprehension","author":"Li","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref54","article-title":"Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal 11 ms in video analysis","author":"Fu","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref55","article-title":"Mlvu: A comprehensive benchmark for multi-task long video understanding","author":"Zhou","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19781-9_19"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11443298.pdf?arnumber=11443298","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:21:14Z","timestamp":1777530074000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11443298\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":58,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.02067","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}