{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T02:10:48Z","timestamp":1755828648058,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T00:00:00Z","timestamp":1701907200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,12,7]]},"DOI":"10.1145\/3628797.3629021","type":"proceedings-article","created":{"date-parts":[[2023,12,6]],"date-time":"2023-12-06T15:25:34Z","timestamp":1701876334000},"page":"997-1002","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Diverse Search Methods and Multi-Modal Fusion for High-Performance Video Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-9053-4974","authenticated-orcid":false,"given":"Sieu","family":"Tran","sequence":"first","affiliation":[{"name":"University of Information Technology (HCM VNU), Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6998-3719","authenticated-orcid":false,"given":"Duc","family":"Minh Nguyen","sequence":"additional","affiliation":[{"name":"University of Information Technology (HCM VNU), Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5657-8747","authenticated-orcid":false,"given":"Triet","family":"Huynh Minh Nguyen","sequence":"additional","affiliation":[{"name":"University of Information Technology (HCM VNU), Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2227-7062","authenticated-orcid":false,"given":"Danh","family":"Phuc Ngo","sequence":"additional","affiliation":[{"name":"University of Information Technology (HCM VNU), Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7823-7911","authenticated-orcid":false,"given":"Thu","family":"Minh Nguyen","sequence":"additional","affiliation":[{"name":"University of Information Technology (HCM VNU), Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3908-4606","authenticated-orcid":false,"given":"Hao","family":"Vo","sequence":"additional","affiliation":[{"name":"University of Information Technology (HCM VNU), Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0451-5479","authenticated-orcid":false,"given":"Khiem","family":"Le","sequence":"additional","affiliation":[{"name":"University of Information Technology (HCM VNU), Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5381-3414","authenticated-orcid":false,"given":"Tien","family":"Do","sequence":"additional","affiliation":[{"name":"University of Information Technology (HCM VNU), Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6882-0070","authenticated-orcid":false,"given":"Thanh","family":"Duc Ngo","sequence":"additional","affiliation":[{"name":"University of Information Technology (HCM VNU), Viet Nam"}]}],"member":"320","published-online":{"date-parts":[[2023,12,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.21608\/ijicis.2021.68816.1072"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3463948.3469071"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3463948.3469070"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Lisa Anne\u00a0Hendricks Oliver Wang Eli Shechtman Josef Sivic Trevor Darrell and Bryan Russell. 2017. Localizing moments in video with natural language. (2017) 5803\u20135812.","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_11"},{"key":"e_1_3_2_1_6_1","unstructured":"David Chen and William\u00a0B Dolan. 2011. Collecting highly parallel data for paraphrase evaluation. (2011) 190\u2013200."},{"key":"e_1_3_2_1_7_1","unstructured":"Duc-Tien Dang-Nguyen Luca Piras Michael Riegler Liting Zhou Mathias Lux and Cathal Gurrin. 2018. Overview of ImageCLEFlifelog 2018: daily living understanding and lifelog moment retrieval. (2018)."},{"key":"e_1_3_2_1_8_1","volume-title":"CEUR Workshop Proceedings.","author":"Dang\u00a0Nguyen Duc\u00a0Tien","year":"2019","unstructured":"Duc\u00a0Tien Dang\u00a0Nguyen, Luca Piras, Michael Riegler, Liting Zhou, Mathias Lux, Minh\u00a0Triet Tran, Tu-Khiem Le, Van-Tu Ninh, and Cathal Gurrin. 2019. Overview of ImageCLEFlifelog 2019: solve my life puzzle and lifelog moment retrieval. CEUR Workshop Proceedings."},{"key":"e_1_3_2_1_9_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00374"},{"key":"e_1_3_2_1_11_1","volume-title":"The 12th International Symposium on Information and Communication Technology, SoICT 2023","author":"Le\u00a0Do","year":"2023","unstructured":"Trong-Le\u00a0Do et al.2023. News Event Retrieval from Large Video Collection in Ho Chi Minh City AI Challenge 2023. In The 12th International Symposium on Information and Communication Technology, SoICT 2023, Ho Chi Minh City, Vietnam, December 7-8, 2023. ACM."},{"key":"e_1_3_2_1_12_1","volume-title":"Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097","author":"Fang Han","year":"2021","unstructured":"Han Fang, Pengfei Xiong, Luhui Xu, and Yu Chen. 2021. Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097 (2021)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2911451.2914680"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the 14th NTCIR conference. NII, 14\u201326","author":"Gurrin Cathal","year":"2019","unstructured":"Cathal Gurrin, Hideo Joho, Frank Hopfgartner, Liting Zhou, V-T Ninh, T-K Le, Rami Albatal, D-T Dang-Nguyen, and Graham Healy. 2019. Overview of the NTCIR-14 lifelog-3 task. In Proceedings of the 14th NTCIR conference. NII, 14\u201326."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460426.3470945"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372278.3388043"},{"key":"e_1_3_2_1_17_1","volume-title":"Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487","author":"Liu Yang","year":"2019","unstructured":"Yang Liu, Samuel Albanie, Arsha Nagrani, and Andrew Zisserman. 2019. Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487 (2019)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_19_1","volume-title":"Learning a text-video embedding from incomplete and heterogeneous data. arXiv preprint arXiv:1804.02516","author":"Miech Antoine","year":"2018","unstructured":"Antoine Miech, Ivan Laptev, and Josef Sivic. 2018. Learning a text-video embedding from incomplete and heterogeneous data. arXiv preprint arXiv:1804.02516 (2018)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3206025.3206064"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3463948.3469065"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-77004-4_1"},{"key":"e_1_3_2_1_23_1","volume-title":"International conference on machine learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3463948.3469068"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512729.3533008"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512729.3533012"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00721"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00621"}],"event":{"name":"SOICT 2023: The 12th International Symposium on Information and Communication Technology","acronym":"SOICT 2023","location":"Ho Chi Minh Vietnam"},"container-title":["Proceedings of the 12th International Symposium on Information and Communication Technology"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3628797.3629021","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3628797.3629021","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T12:23:50Z","timestamp":1755779030000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3628797.3629021"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,7]]},"references-count":29,"alternative-id":["10.1145\/3628797.3629021","10.1145\/3628797"],"URL":"https:\/\/doi.org\/10.1145\/3628797.3629021","relation":{},"subject":[],"published":{"date-parts":[[2023,12,7]]},"assertion":[{"value":"2023-12-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}