{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T18:47:39Z","timestamp":1755802059119,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":15,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733490","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:29:43Z","timestamp":1750876183000},"page":"2053-2057","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Optimization of CLIP Models for Domain-Specific Video Search"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-1691-1858","authenticated-orcid":false,"given":"Kazuya","family":"Ueki","sequence":"first","affiliation":[{"name":"Meisei University, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7143-2781","authenticated-orcid":false,"given":"Haruki","family":"Sato","sequence":"additional","affiliation":[{"name":"Agoop Corporation, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4542-6243","authenticated-orcid":false,"given":"Yuma","family":"Suzuki","sequence":"additional","affiliation":[{"name":"SoftBank Corporation, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8232-5922","authenticated-orcid":false,"given":"Takayuki","family":"Hori","sequence":"additional","affiliation":[{"name":"SoftBank Corporation, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2041-8713","authenticated-orcid":false,"given":"Hiroki","family":"Takushima","sequence":"additional","affiliation":[{"name":"SoftBank Corporation, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7689-4544","authenticated-orcid":false,"given":"Takumi","family":"Takada","sequence":"additional","affiliation":[{"name":"SB Intuitions Corporation, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7050-035X","authenticated-orcid":false,"given":"Hayato","family":"Tanoue","sequence":"additional","affiliation":[{"name":"SoftBank Corporation, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0170-4822","authenticated-orcid":false,"given":"Aiswariya","family":"Manoj Kumar","sequence":"additional","affiliation":[{"name":"SoftBank Corporation, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0344-5494","authenticated-orcid":false,"given":"Hiroki","family":"Nishihara","sequence":"additional","affiliation":[{"name":"SoftBank Corporation, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1248-0254","authenticated-orcid":false,"given":"Yuki","family":"Shibata","sequence":"additional","affiliation":[{"name":"SoftBank Corporation, Tokyo, Japan"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume":"201","author":"Alayrac M.","unstructured":"J.-B. Alayrac M. Tapaswi I. Laptev A. Miech, D. Zhukov and J. Sivic. 2019. HowTo100M: Learning a Text-Video Embedding by Watching Hundred Million Narrated Video Clips. In In Proc. of the IEEE\/CVF International Conference on Computer Vision (ICCV).","journal-title":"J. Sivic."},{"key":"e_1_3_2_1_2_1","unstructured":"G. Awad K. Curtis A. A. Butt J. Fiscus A. Godil Y. Lee A. Delgado E. Godard B. Chocot L. Diduch Y. Graham and G. Qu\u00e9not. 2023. TRECVID 2023 - A series of evaluation tracks in video understanding. In Proc. of TRECVID 2023."},{"key":"e_1_3_2_1_3_1","volume-title":"Visual Instruction Tuning. In In Proc. of the International Conference on Neural Information Processing Systems (NeurIPS).","author":"Wu Y. J.","year":"2023","unstructured":"Q. Wu Y. J. Lee H. Liu, C. Li. 2023a. Visual Instruction Tuning. In In Proc. of the International Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_4_1","volume-title":"Improved Baselines with Visual Instruction Tuning. arXiv:2310.03744","author":"Li Y. J.","year":"2023","unstructured":"Y. Li Y. J. Lee H. Liu, C. Li. 2023b. Improved Baselines with Visual Instruction Tuning. arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Proc. of TRECVID","author":"He R.","year":"2023","unstructured":"J. He, R. Li, J. Guo, H. Zhang, M. Li, Z. Wu, Z. Wang, B. Du, and C. Liang. 2023. WHU-NERCMS at TRECVID 2023: Ad-hoc Video Search (AVS) and Deep Video Understanding (DVU) Tasks. In Proc. of TRECVID 2023."},{"key":"e_1_3_2_1_6_1","unstructured":"S. Savarese J. Li D. Li and S. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_7_1","volume":"201","author":"Xu L.","unstructured":"C. Xu L. Zhou and J. J. Corso. 2018. Towards Automatic Learning of Procedures from Web Instructional Videos. In In Proc. of the AAAI Conference on Artificial Intelligence.","journal-title":"J. Corso."},{"key":"e_1_3_2_1_8_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. arXiv preprint arXiv:2201.12086","author":"Li D.","year":"2022","unstructured":"J. Li, D. Li, C. Xiong, and S. Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. arXiv preprint arXiv:2201.12086 (2022)."},{"volume-title":"Robust Fine-Tuning of Zero-Shot Models. In In Proc. of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7959--7971","author":"Kim M.","key":"e_1_3_2_1_9_1","unstructured":"J. W. Kim M. Li S. Kornblith R. Roelofs R. G. Lopes H. Hajishirzi A. Farhadi H. Namkoong M. Wortsman, G. Ilharco and L. Schmidt. 2022. Robust Fine-Tuning of Zero-Shot Models. In In Proc. of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7959--7971."},{"key":"e_1_3_2_1_10_1","unstructured":"A. Radford J. W. Kim C. Hallacy A. Ramesh G. Goh S. Agarwal G. Sastry A. Askell P. Mishkin J. Clark G. Krueger and I. Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv preprint arXiv:2103.00020 (2021)."},{"key":"e_1_3_2_1_11_1","volume-title":"Butt","author":"Luca Rossetto","year":"2019","unstructured":"Luca Rossetto, Heiko Schuldt, George Awad, and Asad A. Butt. 2019. V3C - A Research Video Collection. In MultiMedia Modeling, Ioannis Kompatsiaris, Benoit Huet, Vasileios Mezaris, Cathal Gurrin, Wen-Huang Cheng, and Stefanos Vrochidis (Eds.). Springer International Publishing, Cham, 349--360."},{"key":"e_1_3_2_1_12_1","volume":"202","author":"Schuhmann R.","unstructured":"C. Schuhmann, R. Beaumont, R. Vencu, C. Gordon, R. Wightman, M. Cherti, T. Coombes, A. Katta, C. Mullis, M. Wortsman, P. Schramowski, S. Kundurthy, K. Crowson, L. Schmidt, R. Kaczmarczyk, and J. Jitsev. 2022. LAION-5B: An open large-scale dataset for training next generation image-text models. In 36th Conference on Neural Information Processing Systems (NeurIPS).","journal-title":"J. Jitsev."},{"key":"e_1_3_2_1_13_1","volume-title":"Proc. of TRECVID","author":"Ueki Y.","year":"2022","unstructured":"K. Ueki, Y. Suzuki, H. Takushima, H. Okamoto, H. Tanoue, and T. Hori. 2022. Waseda_Meisei_SoftBank at TRECVID 2022. In Proc. of TRECVID 2022."},{"volume-title":"In Proc. of the International Conference on Learning Representations (ICLR).","author":"Ma W.","key":"e_1_3_2_1_14_1","unstructured":"J. Ma W. Tian R. Feng Y. Zhang Y. Li Y. Guo X. Huang, Y. Zhang and L. Zhang. 2024. Tag2Text: Guiding Vision-Language Model via Image Tagging. In In Proc. of the International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_15_1","first-page":"18123","volume-title":"LiT: Zero-Shot Transfer With Locked-Image Text Tuning. In In Proc. of the IEEE\/CVF Conference on Computer Vision and Pattern (CVPR)","author":"Mustafa A.","unstructured":"B. Mustafa A. Steiner D. Keysers A. Kolesnikov X. Zhai, X. Wang and L. Beyer. 2022. LiT: Zero-Shot Transfer With Locked-Image Text Tuning. In In Proc. of the IEEE\/CVF Conference on Computer Vision and Pattern (CVPR), pp.18123--18133."}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733490","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:14:00Z","timestamp":1755749640000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733490"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":15,"alternative-id":["10.1145\/3731715.3733490","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733490","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}