{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T05:27:57Z","timestamp":1738819677448,"version":"3.37.0"},"reference-count":18,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,9,18]],"date-time":"2024-09-18T00:00:00Z","timestamp":1726617600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,9,18]],"date-time":"2024-09-18T00:00:00Z","timestamp":1726617600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100009427","name":"Telecommunications Advancement Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100009427","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,9,18]]},"DOI":"10.1109\/cbmi62980.2024.10859201","type":"proceedings-article","created":{"date-parts":[[2025,2,4]],"date-time":"2025-02-04T18:30:52Z","timestamp":1738693852000},"page":"1-7","source":"Crossref","is-referenced-by-count":0,"title":["Elevating Video Retrieval Capabilities: A Cross-Modal Approach Utilizing Text and Image Generative Models"],"prefix":"10.1109","author":[{"given":"Kazuya","family":"Ueki","sequence":"first","affiliation":[{"name":"Meisei University,Tokyo,Japan"}]},{"given":"Yuma","family":"Suzuki","sequence":"additional","affiliation":[{"name":"SoftBank Corp.,Tokyo,Japan"}]},{"given":"Haruki","family":"Sato","sequence":"additional","affiliation":[{"name":"SoftBank Corp.,Tokyo,Japan"}]},{"given":"Takayuki","family":"Hori","sequence":"additional","affiliation":[{"name":"SoftBank Corp.,Tokyo,Japan"}]},{"given":"Takumi","family":"Takada","sequence":"additional","affiliation":[{"name":"SoftBank Corp.,Tokyo,Japan"}]},{"given":"Hiroki","family":"Takushima","sequence":"additional","affiliation":[{"name":"SoftBank Corp.,Tokyo,Japan"}]},{"given":"Hayato","family":"Tanoue","sequence":"additional","affiliation":[{"name":"SoftBank Corp.,Tokyo,Japan"}]},{"given":"Aiswariya Manoj","family":"Kumar","sequence":"additional","affiliation":[{"name":"SoftBank Corp.,Tokyo,Japan"}]},{"given":"Hiroki","family":"Nishihara","sequence":"additional","affiliation":[{"name":"SoftBank Corp.,Tokyo,Japan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0987-1"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"journal-title":"Learning Transferable Visual Models From Natural Language Super-vision","year":"2021","author":"Radford","key":"ref5"},{"journal-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation","year":"2022","author":"Li","key":"ref6"},{"journal-title":"LAION-400M: Open Dataset of CLIP-Filtered 400 Million Image-Text Pairs","year":"2021","author":"Schuhmann","key":"ref7"},{"key":"ref8","article-title":"LAION-5B: An open large-scale dataset for training next generation image-text models","volume-title":"36th Conference on Neural Information Processing Systems (NeurIPS)","author":"Schuhmann","year":"2022"},{"key":"ref9","article-title":"TRECVID 2023 - A series of evaluation tracks in video understanding","volume-title":"Proc. 
of TRECVID 2023","author":"Awad","year":"2023"},{"key":"ref10","article-title":"Waseda_Meisei at TRECVID 2017: Ad-hoc Video Search","volume-title":"Notebook paper of the TRECVID 2017 Workshop","author":"Ueki","year":"2017"},{"key":"ref11","article-title":"WHU-NERCMS at TRECVID 2023: Ad-hoc Video Search (AVS) and Deep Video Uunderstaning (DVU) Tasks","volume-title":"Proc. of TRECVID 2023","author":"He","year":"2023"},{"key":"ref12","article-title":"Waseda_Meisei_SoftBank at TRECVID 2023","volume-title":"Proc. of TRECVID 2023","author":"Ueki","year":"2023"},{"key":"ref13","article-title":"Mind the Gap: Understanding the Modality Gap in Multi-modal Contrastive Represen-tation Learning","volume-title":"Proc. of the Conference on Neural Information Processing Systems (NeurIPS)","author":"Liang","year":"2022"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref15","article-title":"Waseda_Meisei_SoftBank at TRECVID 2022","volume-title":"Proc. of TRECVID 2022","author":"Ueki","year":"2022"},{"key":"ref16","article-title":"Renmin University of China at TRECVID 2022: Improving Video Search by Feature Fusion and Negation Understanding","volume-title":"Proc. of TRECVID 2022","author":"Li","year":"2022"},{"key":"ref17","article-title":"ITI-CERTH participation in ActEV and AVS Tracks of TRECVID 2022","volume-title":"Proc. of TRECVID 2022","author":"Gkountakos","year":"2022"},{"key":"ref18","article-title":"Renmin University of China and Tencent at TRECVID 2023: Harnessing Pre-trained Models for Ad-hoc Video Search","volume-title":"Proc. of TRECVID 2023","author":"Li","year":"2023"}],"event":{"name":"2024 International Conference on Content-Based Multimedia Indexing (CBMI)","start":{"date-parts":[[2024,9,18]]},"location":"Reykjavik, Iceland","end":{"date-parts":[[2024,9,20]]}},"container-title":["2024 International Conference on Content-Based Multimedia Indexing (CBMI)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10858870\/10858871\/10859201.pdf?arnumber=10859201","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,5]],"date-time":"2025-02-05T18:44:50Z","timestamp":1738781090000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10859201\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,18]]},"references-count":18,"URL":"https:\/\/doi.org\/10.1109\/cbmi62980.2024.10859201","relation":{},"subject":[],"published":{"date-parts":[[2024,9,18]]}}}