{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T17:50:07Z","timestamp":1773942607622,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T00:00:00Z","timestamp":1701907200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,12,7]]},"DOI":"10.1145\/3628797.3628975","type":"proceedings-article","created":{"date-parts":[[2023,12,6]],"date-time":"2023-12-06T15:25:34Z","timestamp":1701876334000},"page":"931-937","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["AGAIN: A Multimodal Human-Centric Event Retrieval System using dual image-to-text representations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-1279-9756","authenticated-orcid":false,"given":"Minh-Nam","family":"Tran","sequence":"first","affiliation":[{"name":"University of Science, VNUHCM, Viet Nam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5408-8138","authenticated-orcid":false,"given":"Tuan-An","family":"To","sequence":"additional","affiliation":[{"name":"University of Science, VNUHCM, Viet Nam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4683-7446","authenticated-orcid":false,"given":"Viet-Nhat","family":"Thai","sequence":"additional","affiliation":[{"name":"University of Science, VNUHCM, Viet Nam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1117-5285","authenticated-orcid":false,"given":"Thanh-Duy","family":"Cao","sequence":"additional","affiliation":[{"name":"University of Science, VNUHCM, Viet Nam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0814-8048","authenticated-orcid":false,"given":"Trong-Tin","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Science, VNUHCM, Viet Nam"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,12,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3463948.3469069"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512729.3533006"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592573.3593103"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3463948.3469071"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-2029"},{"key":"e_1_3_2_1_6_1","unstructured":"Duc-Tien Dang-Nguyen Luca Piras Michael Riegler Giulia Boato Liting Zhou and Cathal Gurrin. 2017. Overview of ImageCLEF lifelog 2017: lifelog retrieval and summarization. (2017)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_8_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ITCC.2004.1286584"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3463948.3469066"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2911451.2914680"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.3169\/mta.7.46"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379172.3391715"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAIS56082.2022.9990349"},{"key":"e_1_3_2_1_15_1","volume-title":"[n. d.]. YOLO by Ultralytics (Version 8.0.0) [Computer software]. https:\/\/github.com\/ultralytics\/ultralytics","author":"Chaurasia G.","year":"2023","unstructured":"Jocher, G. Chaurasia, A., Qiu, and J. (2023).[n. d.]. YOLO by Ultralytics (Version 8.0.0) [Computer software]. https:\/\/github.com\/ultralytics\/ultralytics."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-37734-2_71"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379172.3391724"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3463948.3469060"},{"key":"e_1_3_2_1_20_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. arxiv:2301.12597\u00a0[cs.CV]"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-37734-2_70"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2016.7899663"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-37734-2_68"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Omkar\u00a0M. Parkhi Andrea Vedaldi and Andrew Zisserman. 2015. Deep Face Recognition. In BMVC.","DOI":"10.5244\/C.29.41"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-67835-7_46"},{"key":"e_1_3_2_1_27_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. CoRR abs\/2103.00020","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. CoRR abs\/2103.00020 (2021). arXiv:2103.00020https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_2_1_28_1","unstructured":"Alec Radford Jong\u00a0Wook Kim Tao Xu Greg Brockman Christine McLeavey and Ilya Sutskever. 2022. Robust Speech Recognition via Large-Scale Weak Supervision. arxiv:2212.04356\u00a0[eess.AS]"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Nils Reimers and Iryna Gurevych. 2019. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arxiv:1908.10084\u00a0[cs.CL]","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/2350204.2350205"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASYU50717.2020.9259802"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3463948.3469073"},{"key":"e_1_3_2_1_34_1","volume-title":"TransNet: A deep network for fast detection of common shot transitions. arXiv preprint arXiv:1906.03363","author":"Sou\u010dek Tom\u00e1\u0161","year":"2019","unstructured":"Tom\u00e1\u0161 Sou\u010dek, Jaroslav Moravec, and Jakub Loko\u010d. 2019. TransNet: A deep network for fast detection of common shot transitions. arXiv preprint arXiv:1906.03363 (2019)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379172.3391719"},{"key":"e_1_3_2_1_36_1","unstructured":"Yuxin Wu Alexander Kirillov Francisco Massa Wan-Yen Lo and Ross Girshick. 2019. Detectron2. https:\/\/github.com\/facebookresearch\/detectron2."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2016.2603342"}],"event":{"name":"SOICT 2023: The 12th International Symposium on Information and Communication Technology","location":"Ho Chi Minh Vietnam","acronym":"SOICT 2023"},"container-title":["Proceedings of the 12th International Symposium on Information and Communication Technology"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3628797.3628975","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3628797.3628975","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T12:23:36Z","timestamp":1755779016000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3628797.3628975"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,7]]},"references-count":37,"alternative-id":["10.1145\/3628797.3628975","10.1145\/3628797"],"URL":"https:\/\/doi.org\/10.1145\/3628797.3628975","relation":{},"subject":[],"published":{"date-parts":[[2023,12,7]]},"assertion":[{"value":"2023-12-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}