{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T14:50:18Z","timestamp":1758120618224,"version":"3.28.0"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,8,15]],"date-time":"2024-08-15T00:00:00Z","timestamp":1723680000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,8,15]],"date-time":"2024-08-15T00:00:00Z","timestamp":1723680000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,8,15]]},"DOI":"10.1109\/mapr63514.2024.10660857","type":"proceedings-article","created":{"date-parts":[[2024,9,10]],"date-time":"2024-09-10T18:23:27Z","timestamp":1725992607000},"page":"1-6","source":"Crossref","is-referenced-by-count":2,"title":["VISA: Video Interactive Search with Advanced Visual Programming"],"prefix":"10.1109","author":[{"given":"Duc-Tuan","family":"Luu","sequence":"first","affiliation":[{"name":"University of Information Technology,Ho Chi Minh City,Vietnam"}]},{"given":"Khanh-Linh","family":"Bui-Le","sequence":"additional","affiliation":[{"name":"University of Information Technology,Ho Chi Minh City,Vietnam"}]},{"given":"Duy-Ngoc","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Information Technology,Ho Chi Minh City,Vietnam"}]},{"given":"Vinh-Tiep","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Information Technology,Ho Chi Minh City,Vietnam"}]},{"given":"Minh-Triet","family":"Tran","sequence":"additional","affiliation":[{"name":"University of Science,Ho Chi Minh City,Vietnam"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"NeurIPS"},{"issue":"240","key":"ref2","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"J. Mach. Learn. Res"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01436"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3628797.3628940"},{"key":"ref5","article-title":"Magicbrush: A manually annotated dataset for instruction-guided image editing","author":"Zhang","year":"2023","journal-title":"NeurIPS"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"ICLR","author":"Dosovitskiy","key":"ref8"},{"article-title":"Beit: Bert pre-training of image transformers","volume-title":"ICLR","author":"Bao","key":"ref9"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20215"},{"key":"ref11","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"ICML","author":"Radford"},{"key":"ref12","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019","journal-title":"NAACL-HLT"},{"article-title":"mslam: Massively multilingual joint pre-training for speech and text","year":"2022","author":"Bapna","key":"ref13"},{"article-title":"Socratic models: Composing zero-shot multimodal reasoning with language","volume-title":"ICLR","author":"Zeng","key":"ref14"},{"article-title":"Gemini: a family of highly capable multimodal models","year":"2023","author":"Team","key":"ref15"},{"issue":"11","key":"ref16","first-page":"120","article-title":"The opencv library","volume":"25","author":"Bradski","year":"2000","journal-title":"Dr Dobbs Journal: Software Tools for the Professional Programmer"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-98355-0_52"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-98355-0_57"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3592573.3593098"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref21","article-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","author":"Ren","year":"2015","journal-title":"NeurIPS"},{"key":"ref22","first-page":"12 888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"ICML","author":"Li"},{"key":"ref23","first-page":"10965","article-title":"Grounded language-image pretraining","volume-title":"CVPR","author":"Li"},{"key":"ref24","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"NeurIPS"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6812"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_11"},{"key":"ref28","first-page":"28 492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"ICML","author":"Radford"},{"article-title":"Grounding dino: Marrying dino with grounded pre-training for open-set object detection","year":"2023","author":"Liu","key":"ref29"},{"article-title":"Grounded sam: Assembling open-world models for diverse visual tasks","year":"2024","author":"Ren","key":"ref30"},{"article-title":"A task is worth one word: Learning with task prompts for high-quality versatile image inpainting","year":"2023","author":"Zhuang","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"article-title":"Gpt-4 technical report","year":"2023","author":"Achiam","key":"ref33"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_6"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_6"},{"article-title":"Sdedit: Guided image synthesis and editing with stochastic differential equations","volume-title":"ICLR","author":"Meng","key":"ref36"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_41"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"article-title":"Hive: Harnessing human feedback for instructional visual editing","year":"2023","author":"Zhang","key":"ref39"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"}],"event":{"name":"2024 International Conference on Multimedia Analysis and Pattern Recognition (MAPR)","start":{"date-parts":[[2024,8,15]]},"location":"Da Nang, Vietnam","end":{"date-parts":[[2024,8,16]]}},"container-title":["2024 International Conference on Multimedia Analysis and Pattern Recognition (MAPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10660665\/10660652\/10660857.pdf?arnumber=10660857","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,11]],"date-time":"2024-09-11T05:34:27Z","timestamp":1726032867000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10660857\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,15]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/mapr63514.2024.10660857","relation":{},"subject":[],"published":{"date-parts":[[2024,8,15]]}}}