{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T02:10:48Z","timestamp":1755828648516,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":27,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T00:00:00Z","timestamp":1701907200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"University of Science Ho Chi Minh City","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,12,7]]},"DOI":"10.1145\/3628797.3628984","type":"proceedings-article","created":{"date-parts":[[2023,12,6]],"date-time":"2023-12-06T15:25:34Z","timestamp":1701876334000},"page":"938-944","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Zero-shot Video Retrieval using CLIP with Temporally Ordered Multi-query Scoring"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-7411-8804","authenticated-orcid":false,"given":"Huy-Giap","family":"Bui","sequence":"first","affiliation":[{"name":"University of Science, VNU-HCM, Viet Nam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5721-3751","authenticated-orcid":false,"given":"Minh-Huy","family":"Trinh","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Viet Nam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8514-7430","authenticated-orcid":false,"given":"Canh-Toan","family":"Le","sequence":"additional","affiliation":[{"name":"International University, VNU-HCM, Viet Nam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3788-7079","authenticated-orcid":false,"given":"Quoc-Lam","family":"Vu","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Viet Nam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7049-8532","authenticated-orcid":false,"given":"Khac-Trieu","family":"Vo","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Viet Nam"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,12,7]]},"reference":[{"volume-title":"Greece)","key":"e_1_3_2_1_1_1","unstructured":"2023. LSC \u201923: Proceedings of the 6th Annual ACM Lifelog Search Challenge (Thessaloniki, Greece). Association for Computing Machinery, New York, NY, USA."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592573.3593103"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_11"},{"key":"e_1_3_2_1_5_1","volume-title":"Fitclip: Refining large-scale pretrained image-text models for zero-shot video understanding tasks. arXiv preprint arXiv:2203.13371","author":"Castro Santiago","year":"2022","unstructured":"Santiago Castro and Fabian\u00a0Caba Heilbron. 2022. Fitclip: Refining large-scale pretrained image-text models for zero-shot video understanding tasks. arXiv preprint arXiv:2203.13371 (2022)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01138"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2505515.2507880"},{"key":"e_1_3_2_1_8_1","volume-title":"The 12th International Symposium on Information and Communication Technology, SoICT 2023","author":"Le\u00a0Do","year":"2023","unstructured":"Trong-Le\u00a0Do et al.2023. News Event Retrieval from Large Video Collection in Ho Chi Minh City AI Challenge 2023. In The 12th International Symposium on Information and Communication Technology, SoICT 2023, Ho Chi Minh City, Vietnam, December 7-8, 2023. ACM."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-20308-4_3"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings, Part IV 16","author":"Gabeur Valentin","year":"2020","unstructured":"Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. 2020. Multi-modal transformer for video retrieval. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part IV 16. Springer, 214\u2013229."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512729.3533003"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW46912.2020.9105954"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/5.58325"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414542"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350906"},{"key":"e_1_3_2_1_16_1","unstructured":"Huaishao Luo Lei Ji Ming Zhong Yang Chen Wen Lei Nan Duan and Tianrui Li. 2021. CLIP4Clip: An Empirical Study of CLIP for End to End Video Clip Retrieval. arxiv:2104.08860\u00a0[cs.CV]"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-77004-4_1"},{"key":"e_1_3_2_1_19_1","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark Gretchen Krueger and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arxiv:2103.00020\u00a0[cs.CV]"},{"key":"e_1_3_2_1_20_1","unstructured":"Alec Radford Jong\u00a0Wook Kim Tao Xu Greg Brockman Christine McLeavey and Ilya Sutskever. 2022. Robust Speech Recognition via Large-Scale Weak Supervision. arxiv:2212.04356\u00a0[eess.AS]"},{"key":"e_1_3_2_1_21_1","volume-title":"Vibro: Video Browsing with\u00a0Semantic and\u00a0Visual Image Embeddings","author":"Schall Konstantin","year":"2023","unstructured":"Konstantin Schall, Nico Hezel, Klaus Jung, and Kai\u00a0Uwe Barthel. 2023. Vibro: Video Browsing with\u00a0Semantic and\u00a0Visual Image Embeddings. In MultiMedia Modeling, Duc-Tien Dang-Nguyen, Cathal Gurrin, Martha Larson, Alan\u00a0F. Smeaton, Stevan Rudinac, Minh-Son Dao, Christoph Trattner, and Phoebe Chen (Eds.). Springer International Publishing, Cham, 665\u2013670."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CBMI.2019.8877397"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Christian Szegedy Sergey Ioffe Vincent Vanhoucke and Alex Alemi. 2016. Inception-v4 Inception-ResNet and the Impact of Residual Connections on Learning. arxiv:1602.07261\u00a0[cs.CV]","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-98355-0_16"},{"key":"e_1_3_2_1_25_1","unstructured":"Chien-Yao Wang Alexey Bochkovskiy and Hong-Yuan\u00a0Mark Liao. 2022. YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors. arxiv:2207.02696\u00a0[cs.CV]"},{"key":"e_1_3_2_1_26_1","unstructured":"Qifeng Wu Huan Wang Xu Ma and Yun Fu. 2023. Distilling Text-Image Foundation Models. https:\/\/openreview.net\/forum?id=VsqE7E-lWB"},{"key":"e_1_3_2_1_27_1","volume-title":"mplug-2: A modularized multi-modal foundation model across text, image and video. arXiv preprint arXiv:2302.00402","author":"Xu Haiyang","year":"2023","unstructured":"Haiyang Xu, Qinghao Ye, Ming Yan, Yaya Shi, Jiabo Ye, Yuanhong Xu, Chenliang Li, Bin Bi, Qi Qian, Wei Wang, 2023. mplug-2: A modularized multi-modal foundation model across text, image and video. arXiv preprint arXiv:2302.00402 (2023)."}],"event":{"name":"SOICT 2023: The 12th International Symposium on Information and Communication Technology","acronym":"SOICT 2023","location":"Ho Chi Minh Vietnam"},"container-title":["Proceedings of the 12th International Symposium on Information and Communication Technology"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3628797.3628984","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3628797.3628984","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T12:22:07Z","timestamp":1755778927000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3628797.3628984"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,7]]},"references-count":27,"alternative-id":["10.1145\/3628797.3628984","10.1145\/3628797"],"URL":"https:\/\/doi.org\/10.1145\/3628797.3628984","relation":{},"subject":[],"published":{"date-parts":[[2023,12,7]]},"assertion":[{"value":"2023-12-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}