{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T14:11:56Z","timestamp":1773324716423,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":19,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,1,19]],"date-time":"2024-01-19T00:00:00Z","timestamp":1705622400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,1,19]]},"DOI":"10.1145\/3653804.3654608","type":"proceedings-article","created":{"date-parts":[[2024,6,1]],"date-time":"2024-06-01T12:22:26Z","timestamp":1717244546000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["DroneGPT: Zero-shot Video Question Answering For Drones"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-6463-0726","authenticated-orcid":false,"given":"Hongjie","family":"Qiu","sequence":"first","affiliation":[{"name":"School of Computer Science, Hangzhou Dianzi University, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3583-4602","authenticated-orcid":false,"given":"Jinqiang","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science, Hangzhou Dianzi University, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3047-183X","authenticated-orcid":false,"given":"Junhao","family":"Gan","sequence":"additional","affiliation":[{"name":"School of Computer Science, Hangzhou Dianzi University, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7332-1836","authenticated-orcid":false,"given":"Shuwen","family":"Zheng","sequence":"additional","affiliation":[{"name":"School of Computer Science, Hangzhou Dianzi University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7077-4947","authenticated-orcid":false,"given":"Liqi","family":"Yan","sequence":"additional","affiliation":[{"name":"School of Computer Science, Hangzhou Dianzi University, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Towards general purpose vision systems. ArXiv, abs\/2104.00743","author":"Tanmay Gupta","year":"2021","unstructured":"Tanmay Gupta, Amita Kamath, Aniruddha Kembhavi, and Derek Hoiem. Towards general purpose vision systems. ArXiv, abs\/2104.00743, 2021. 2"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Liu Shilong Zhaoyang Zeng Tianhe Ren Feng Li Hao Zhang Jie Yang Chun-yue Li Jianwei Yang Hang Su Jun-Juan Zhu and Lei Zhang. \u201cGrounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection.\u201d\u00a0ArXiv\u00a0abs\/2303.05499 (2023): n. pag.","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"e_1_3_2_1_3_1","first-page":"14953","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\u00a0(2022)","author":"Gupta","unstructured":"Gupta, Tanmay and Aniruddha Kembhavi. \u201cVisual Programming: Compositional visual reasoning without training.\u201d\u00a02023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\u00a0(2022): 14953-14962."},{"issue":"11","key":"e_1_3_2_1_4_1","first-page":"120","article-title":"The opencv library","volume":"25","author":"Gary Bradski","year":"2000","unstructured":"Gary Bradski. The opencv library. Dr. Dobb's Journal: Software Tools for the Professional Programmer, 25(11):120\u2013 123, 2000. 2","journal-title":"Dr. Dobb's Journal: Software Tools for the Professional Programmer"},{"key":"e_1_3_2_1_5_1","volume-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection","author":"Hao Zhang","year":"2022","unstructured":"Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M. Ni, and Heung-Yeung Shum. Dino: Detr with improved denoising anchor boxes for end-to-end object detection, 2022."},{"key":"e_1_3_2_1_6_1","volume-title":"Conference on Computer Vision\u00a0(2014)","author":"Lin","unstructured":"Lin, Tsung-Yi, Michael Maire, Serge J. Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r and C. Lawrence Zitnick. \u201cMicrosoft COCO: Common Objects in Context.\u201d\u00a0European Conference on Computer Vision\u00a0(2014)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00030"},{"key":"e_1_3_2_1_8_1","volume-title":"Grounded Language-Image Pre-training. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 10955-10965","author":"Li L.H.","year":"2021","unstructured":"Li, L.H., Zhang, P., Zhang, H., Yang, J., Li, C., Zhong, Y., Wang, L., Yuan, L., Zhang, L., Hwang, J., Chang, K., & Gao, J. (2021). Grounded Language-Image Pre-training. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 10955-10965."},{"key":"e_1_3_2_1_9_1","volume-title":"VQGAN-CLIP: Open Domain Image Generation and Editing with Natural Language Guidance. European Conference on Computer Vision.","author":"Crowson K.","year":"2022","unstructured":"Crowson, K., Biderman, S., Kornis, D., Stander, D., Hallahan, E., Castricato, L., & Raff, E. (2022). VQGAN-CLIP: Open Domain Image Generation and Editing with Natural Language Guidance. European Conference on Computer Vision."},{"key":"e_1_3_2_1_10_1","volume-title":"Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese. ArXiv, abs\/2211.01335","author":"Yang A.","year":"2022","unstructured":"Yang, A., Pan, J., Lin, J., Men, R., Zhang, Y., Zhou, J., & Zhou, C. (2022). Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese. ArXiv, abs\/2211.01335."},{"key":"e_1_3_2_1_11_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. International Conference on Machine Learning.","author":"Radford A.","year":"2021","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., & Sutskever, I. (2021). Learning Transferable Visual Models From Natural Language Supervision. International Conference on Machine Learning."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2938015"},{"key":"e_1_3_2_1_13_1","volume-title":"Multi-modal Factorized Bilinear Pooling with Co-attention Learning for Visual Question Answering. 2017 IEEE International Conference on Computer Vision (ICCV)","author":"Yu Z.","year":"2017","unstructured":"Yu, Z., Yu, J., Fan, J., & Tao, D. (2017). Multi-modal Factorized Bilinear Pooling with Co-attention Learning for Visual Question Answering. 2017 IEEE International Conference on Computer Vision (ICCV), 1839-1848."},{"key":"e_1_3_2_1_14_1","volume-title":"Location-Aware Graph Convolutional Networks for Video Question Answering. AAAI Conference on Artificial Intelligence.","author":"Huang D.","year":"2020","unstructured":"Huang, D., Chen, P., Zeng, R., Du, Q., Tan, M., & Gan, C. (2020). Location-Aware Graph Convolutional Networks for Video Question Answering. AAAI Conference on Artificial Intelligence."},{"key":"e_1_3_2_1_15_1","first-page":"8376","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Jiaxin Shi","year":"2019","unstructured":"Jiaxin Shi, Hanwang Zhang, Juanzi Li; Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2019, pp. 8376-8384"},{"key":"e_1_3_2_1_16_1","unstructured":"Alayrac Jean-Baptiste Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katie Millican Malcolm Reynolds Roman Ring Eliza Rutherford Serkan Cabi Tengda Han Zhitao Gong Sina Samangooei Marianne Monteiro Jacob Menick Sebastian Borgeaud Andy Brock Aida Nematzadeh Sahand Sharifzadeh Mikolaj Binkowski Ricardo Barreira Oriol Vinyals Andrew Zisserman and Karen Simonyan. \u201cFlamingo: a Visual Language Model for Few-Shot Learning.\u201d ArXiv abs\/2204.14198 (2022): n. pag."},{"key":"e_1_3_2_1_17_1","unstructured":"Brown Tom B. Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger T. J. Henighan Rewon Child Aditya Ramesh Daniel M. Ziegler Jeff Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. \u201cLanguage Models are Few-Shot Learners.\u201d ArXiv abs\/2005.14165 (2020): n. pag."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3177320"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3202574"}],"event":{"name":"CVDL 2024: The International Conference on Computer Vision and Deep Learning","location":"Changsha China","acronym":"CVDL 2024"},"container-title":["Proceedings of the International Conference on Computer Vision and Deep Learning"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3653804.3654608","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3653804.3654608","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:27:02Z","timestamp":1755876422000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3653804.3654608"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1,19]]},"references-count":19,"alternative-id":["10.1145\/3653804.3654608","10.1145\/3653804"],"URL":"https:\/\/doi.org\/10.1145\/3653804.3654608","relation":{},"subject":[],"published":{"date-parts":[[2024,1,19]]},"assertion":[{"value":"2024-06-01","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}