{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T00:55:18Z","timestamp":1773708918358,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681024","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"2389-2398","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Multi-Modal Inductive Framework for Text-Video Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1612-4644","authenticated-orcid":false,"given":"Qian","family":"Li","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9883-5621","authenticated-orcid":false,"given":"Yucheng","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Macau, Macau, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2513-3822","authenticated-orcid":false,"given":"Cheng","family":"Ji","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7013-6913","authenticated-orcid":false,"given":"Feihong","family":"Lu","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3449-759X","authenticated-orcid":false,"given":"Jianian","family":"Gong","sequence":"additional","affiliation":[{"name":"Beijing University of Aeronautics and Astronautics, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7245-1298","authenticated-orcid":false,"given":"Shangguang","family":"Wang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5152-0055","authenticated-orcid":false,"given":"Jianxin","family":"Li","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Max Bain Arsha Nagrani G\u00fcl Varol and Andrew Zisserman. 2021. Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. In ICCV. 1708--1718.","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_1_2_1","unstructured":"Xiaojun Chang Yi Yang Alexander Hauptmann Eric P Xing and Yao-Liang Yu. 2015. Semantic concept discovery for large-scale zero-shot event detection. In Twenty-fourth international joint conference on artificial intelligence."},{"key":"e_1_3_2_1_3_1","unstructured":"David Chen and William B Dolan. 2011. Collecting highly parallel data for paraphrase evaluation. In ACL. 190--200."},{"key":"e_1_3_2_1_4_1","volume-title":"Improving Video-Text Retrieval by Multi-Stream Corpus Alignment and Dual Softmax Loss. CoRR","author":"Cheng Xing","year":"2021","unstructured":"Xing Cheng, Hezheng Lin, Xiangyu Wu, Fan Yang, and Dong Shen. 2021. Improving Video-Text Retrieval by Multi-Stream Corpus Alignment and Dual Softmax Loss. CoRR, Vol. abs\/2109.04290 (2021). [arXiv]2109.04290"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Anoop Cherian Chiori Hori Tim K. Marks and Jonathan Le Roux. 2022. 
(2.5+1)D Spatio-Temporal Scene Graphs for Video Question Answering. In AAAI. 444--453.","DOI":"10.1609\/aaai.v36i1.19922"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Ioana Croitoru Simion-Vlad Bogolin Marius Leordeanu Hailin Jin Andrew Zisserman Samuel Albanie and Yang Liu. 2021. TeachText: CrossModal Generalized Distillation for Text-Video Retrieval. In ICCV. 11563--11573.","DOI":"10.1109\/ICCV48922.2021.01138"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3150959"},{"key":"e_1_3_2_1_8_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In ICLR."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Alex Falcon Giuseppe Serra and Oswald Lanz. 2022. A feature-space multimodal data augmentation technique for text-video retrieval. In ACM MM. 4385--4394.","DOI":"10.1145\/3503161.3548365"},{"key":"e_1_3_2_1_10_1","volume-title":"UATVR: Uncertainty-Adaptive Text-Video Retrieval. In ICCV. 13677--13687.","author":"Fang Bo","year":"2023","unstructured":"Bo Fang, Wenhao Wu, Chang Liu, Yu Zhou, Yuxin Song, Weiping Wang, Xiangbo Shu, Xiangyang Ji, and Jingdong Wang. 2023. UATVR: Uncertainty-Adaptive Text-Video Retrieval. In ICCV. 13677--13687."},{"key":"e_1_3_2_1_11_1","volume-title":"CLIP2Video: Mastering Video-Text Retrieval via Image CLIP. CoRR","author":"Fang Han","year":"2021","unstructured":"Han Fang, Pengfei Xiong, Luhui Xu, and Yu Chen. 2021. CLIP2Video: Mastering Video-Text Retrieval via Image CLIP. CoRR, Vol. abs\/2106.11097 (2021). 
[arXiv]2106.11097"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27941"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Xiang Fang Daizong Liu Pan Zhou and Guoshun Nan. 2023. You can ground earlier than see: An effective and efficient pipeline for temporal sentence grounding in compressed videos. In CVPR. 2448--2460.","DOI":"10.1109\/CVPR52729.2023.00242"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Xiang Fang Zeyu Xiong Wanlong Fang Xiaoye Qu Chen Chen Jianfeng Dong Keke Tang Pan Zhou Yu Cheng and Daizong Liu. 2024. Rethinking Weakly-supervised Video Temporal Grounding From a Game Perspective. In ECCV.","DOI":"10.1007\/978-3-031-72995-9_17"},{"key":"e_1_3_2_1_15_1","first-page":"214","article-title":"Multi-modal Transformer for Video Retrieval","volume":"12349","author":"Gabeur Valentin","year":"2020","unstructured":"Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. 2020. Multi-modal Transformer for Video Retrieval. In ECCV, Vol. 12349. 214--229.","journal-title":"ECCV"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Satya Krishna Gorti No\u00ebl Vouitsis Junwei Ma Keyvan Golestan Maksims Volkovs Animesh Garg and Guangwei Yu. 2022. X-Pool: Cross-Modal Language-Video Attention for Text-Video Retrieval. In CVPR. 4996--5005.","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2019.102067"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Aman Gupta Sirjan Kafle Di Wen Dylan Wang Sumit Srivastava Suhit Sinha Nikita Gupta Bharat Jain Ananth Sankar and Liang Zhang. 2020. Image and Video Understanding for Recommendation and Spam Detection Systems. In KDD. 3577--3578.","DOI":"10.1145\/3394486.3406485"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Amirhossein Habibian Thomas Mensink and Cees GM Snoek. 2014. 
Composite concept discovery for zero-shot video event detection. In ICMR. 17--24.","DOI":"10.1145\/2578726.2578746"},{"key":"e_1_3_2_1_20_1","volume-title":"Visual spatio-temporal relation-enhanced network for cross-modal text-video retrieval. arXiv preprint arXiv:2110.15609","author":"Han Ning","year":"2021","unstructured":"Ning Han, Jingjing Chen, Guangyi Xiao, Yawen Zeng, Chuhao Shi, and Hao Chen. 2021. Visual spatio-temporal relation-enhanced network for cross-modal text-video retrieval. arXiv preprint arXiv:2110.15609 (2021)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Fabian Caba Heilbron Victor Escorcia Bernard Ghanem and Juan Carlos Niebles. 2015. ActivityNet: A large-scale video benchmark for human activity understanding. In CVPR. 961--970.","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_22_1","volume-title":"Russell","author":"Hendricks Lisa Anne","year":"2017","unstructured":"Lisa Anne Hendricks, Oliver Wang, Eli Shechtman, Josef Sivic, Trevor Darrell, and Bryan C. Russell. 2017. Localizing Moments in Video with Natural Language. In ICCV. 5804--5813."},{"key":"e_1_3_2_1_23_1","volume-title":"Yang","author":"Hong Sungeun","year":"2018","unstructured":"Sungeun Hong, Woobin Im, and Hyun S. Yang. 2018. CBVMR: Content-Based Video-Music Retrieval Using Soft Intra-Modal Structure Constraint. In ICMR (Yokohama, Japan). New York, NY, USA, 353--361."},{"key":"e_1_3_2_1_24_1","unstructured":"Peng Jin Jinfa Huang Fenglin Liu Xian Wu Shen Ge Guoli Song David A. Clifton and Jie Chen. 2022. Expectation-Maximization Contrastive Learning for Compact Video-and-Language Representations. In NeurIPS."},{"key":"e_1_3_2_1_25_1","volume-title":"Text-Video Retrieval with Disentangled Conceptualization and Set-to-Set Alignment. CoRR","author":"Jin Peng","year":"2023","unstructured":"Peng Jin, Hao Li, Zesen Cheng, Jinfa Huang, Zhennan Wang, Li Yuan, Chang Liu, and Jie Chen. 2023. 
Text-Video Retrieval with Disentangled Conceptualization and Set-to-Set Alignment. CoRR, Vol. abs\/2305.12218 (2023). [arXiv]2305.12218"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Jie Lei Linjie Li Luowei Zhou Zhe Gan Tamara L. Berg Mohit Bansal and Jingjing Liu. 2021. Less Is More: ClipBERT for Video-and-Language Learning via Sparse Sampling. In CVPR. 7331--7341.","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_1_27_1","volume-title":"HERO: Hierarchical Encoder for Video+Language Omni-representation Pre-training. In EMNLP. 2046--2065.","author":"Li Linjie","year":"2020","unstructured":"Linjie Li, Yen-Chun Chen, Yu Cheng, Zhe Gan, Licheng Yu, and Jingjing Liu. 2020. HERO: Hierarchical Encoder for Video+Language Omni-representation Pre-training. In EMNLP. 2046--2065."},{"key":"e_1_3_2_1_28_1","unstructured":"Pandeng Li Chen-Wei Xie Liming Zhao Hongtao Xie Jiannan Ge Yun Zheng Deli Zhao and Yongdong Zhang. 2023. Progressive spatio-temporal prototype matching for text-video retrieval. In ICCV. 4100--4110."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Qian Li Lixin Su Jiashu Zhao Long Xia Hengyi Cai Suqi Cheng Hengzhu Tang Junfeng Wang and Dawei Yin. 2024. Text-Video Retrieval via Multi-Modal Hypergraph Networks. In WSDM. 369--377.","DOI":"10.1145\/3616855.3635757"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Ke Liang Lingyuan Meng Meng Liu Yue Liu Wenxuan Tu Siwei Wang Sihang Zhou and Xinwang Liu. 2023. Learn from relational correlations and periodic events for temporal knowledge graph reasoning. In ACM SIGIR. 1559--1568.","DOI":"10.1145\/3539618.3591711"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.01.094"},{"key":"e_1_3_2_1_32_1","volume-title":"Visual instruction tuning. arXiv preprint arXiv:2304.08485","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. 
arXiv preprint arXiv:2304.08485 (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Song Liu Haoqi Fan Shengsheng Qian Yiru Chen Wenkui Ding and Zhongyuan Wang. 2021. HiT: Hierarchical Transformer with Momentum Contrast for Video-Text Retrieval. In ICCV. 11895--11905.","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"e_1_3_2_1_34_1","unstructured":"Yang Liu Samuel Albanie Arsha Nagrani and Andrew Zisserman. 2019. Use What You Have: Video retrieval using representations from collaborative experts. In BMVC. 279."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Yu Liu Huai Chen Lianghua Huang Di Chen Bin Wang Pan Pan and Lisheng Wang. 2022. Animating Images to Transfer CLIP for Video-Text Retrieval. In SIGIR. 1906--1911.","DOI":"10.1145\/3477495.3531776"},{"key":"e_1_3_2_1_36_1","first-page":"319","article-title":"TS2-Net: Token Shift and Selection Transformer for Text-Video Retrieval","volume":"13674","author":"Liu Yuqi","year":"2022","unstructured":"Yuqi Liu, Pengfei Xiong, Luhui Xu, Shengming Cao, and Qin Jin. 2022. TS2-Net: Token Shift and Selection Transformer for Text-Video Retrieval. In ECCV, Vol. 13674. 319--335.","journal-title":"ECCV"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_38_1","unstructured":"Yiwei Ma Guohai Xu Xiaoshuai Sun Ming Yan Ji Zhang and Rongrong Ji. 2022. X-CLIP: End-to-End Multi-grained Contrastive Learning for Video-Text Retrieval. In ACM MM. 638--647."},{"key":"e_1_3_2_1_39_1","volume-title":"Florian Metze, Alexander G. Hauptmann, Jo\u00e3o F. Henriques, and Andrea Vedaldi.","author":"Patrick Mandela","year":"2021","unstructured":"Mandela Patrick, Po-Yao Huang, Yuki Markus Asano, Florian Metze, Alexander G. Hauptmann, Jo\u00e3o F. Henriques, and Andrea Vedaldi. 2021. Support-set bottlenecks for video-text representation learning. 
In ICLR."},{"key":"e_1_3_2_1_40_1","volume-title":"CLIPPING: Distilling CLIP-Based Models with a Student Base for Video-Language Retrieval. In CVPR. 18983--18992.","author":"Pei Renjing","year":"2023","unstructured":"Renjing Pei, Jianzhuang Liu, Weimian Li, Bin Shao, Songcen Xu, Peng Dai, Juwei Lu, and Youliang Yan. 2023. CLIPPING: Distilling CLIP-Based Models with a Student Base for Video-Language Retrieval. In CVPR. 18983--18992."},{"key":"e_1_3_2_1_41_1","volume-title":"Machine Learning for Visualization Recommendation Systems: Open Challenges and Future Directions. CoRR","author":"Podo Luca","year":"2023","unstructured":"Luca Podo, Bardh Prenkaj, and Paola Velardi. 2023. Machine Learning for Visualization Recommendation Systems: Open Challenges and Future Directions. CoRR, Vol. abs\/2302.00569 (2023). [arXiv]2302.00569"},{"key":"e_1_3_2_1_42_1","first-page":"8748","article-title":"Learning Transferable Visual Models From Natural Language Supervision","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML, Vol. 139. 8748--8763.","journal-title":"ICML"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0987-1"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00559"},{"key":"e_1_3_2_1_45_1","volume-title":"UMG-CLIP: A Unified Multi-Granularity Vision Generalist for Open-World Understanding. CoRR","author":"Shi Bowen","year":"2024","unstructured":"Bowen Shi, Peisen Zhao, Zichen Wang, Yuhang Zhang, Yaoming Wang, Jin Li, Wenrui Dai, Junni Zou, Hongkai Xiong, Qi Tian, and Xiaopeng Zhang. 2024. UMG-CLIP: A Unified Multi-Granularity Vision Generalist for Open-World Understanding. CoRR, Vol. abs\/2401.06397 (2024). 
[arXiv]2401.06397"},{"key":"e_1_3_2_1_46_1","volume-title":"HowToCaption: Prompting LLMs to Transform Video Annotations at Scale. arXiv preprint arXiv:2310.04900","author":"Shvetsova Nina","year":"2023","unstructured":"Nina Shvetsova, Anna Kukleva, Xudong Hong, Christian Rupprecht, Bernt Schiele, and Hilde Kuehne. 2023. HowToCaption: Prompting LLMs to Transform Video Annotations at Scale. arXiv preprint arXiv:2310.04900 (2023)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Xiaohan Wang Linchao Zhu and Yi Yang. 2021. T2vlad: global-local sequence alignment for text-video retrieval. In CVPR. 5079--5088.","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Xiaohan Wang Linchao Zhu and Yi Yang. 2021. T2VLAD: Global-Local Sequence Alignment for Text-Video Retrieval. In CVPR. 5079--5088.","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Wenhao Wu Haipeng Luo Bo Fang Jingdong Wang and Wanli Ouyang. 2023. Cap4Video: What Can Auxiliary Captions Do for Text-Video Retrieval?. In CVPR. 10704--10713.","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Junbin Xiao Angela Yao Zhiyuan Liu Yicong Li Wei Ji and Tat-Seng Chua. 2022. Video as Conditional Graph Hierarchy for Multi-Granular Question Answering. In AAAI. 2804--2812.","DOI":"10.1609\/aaai.v36i3.20184"},{"key":"e_1_3_2_1_51_1","volume-title":"Msr-vtt: A large video description dataset for bridging video and language. In CVPR. 5288--5296.","author":"Xu Jun","year":"2016","unstructured":"Jun Xu, Tao Mei, Ting Yao, and Yong Rui. 2016. Msr-vtt: A large video description dataset for bridging video and language. In CVPR. 5288--5296."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Konstantin Yakovlev Gregory Polyakov Ilseyar Alimova Alexander Podolskiy Andrey Bout Sergey Nikolenko and Irina Piontkovskaya. 2023. 
Sinkhorn Transformations for Single-Query Postprocessing in Text-Video Retrieval. In SIGIR. 2394--2398.","DOI":"10.1145\/3539618.3592064"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Jianwei Yang Yonatan Bisk and Jianfeng Gao. 2021. TACo: Token-aware Cascade Contrastive Learning for Video-Text Alignment. In ICCV. 11542--11552.","DOI":"10.1109\/ICCV48922.2021.01136"},{"key":"e_1_3_2_1_54_1","volume-title":"DGL: Dynamic Global-Local Prompt Tuning for Text-Video Retrieval. In AAAI. 6540--6548.","author":"Yang Xiangpeng","year":"2024","unstructured":"Xiangpeng Yang, Linchao Zhu, Xiaohan Wang, and Yi Yang. 2024. DGL: Dynamic Global-Local Prompt Tuning for Text-Video Retrieval. In AAAI. 6540--6548."},{"key":"e_1_3_2_1_55_1","volume-title":"Classification and Retrieval of Multimedia Audio Learning Resources. iJET","author":"Zhang Wenwen","year":"2023","unstructured":"Wenwen Zhang. 2023. Classification and Retrieval of Multimedia Audio Learning Resources. iJET, Vol. 18, 20 (2023), 99--113."},{"key":"e_1_3_2_1_56_1","volume-title":"Knowledgeable Preference Alignment for LLMs in Domain-specific Question Answering. CoRR","author":"Zhang Yichi","year":"2023","unstructured":"Yichi Zhang, Zhuo Chen, Yin Fang, Lei Cheng, Yanxi Lu, Fangming Li, Wen Zhang, and Huajun Chen. 2023. Knowledgeable Preference Alignment for LLMs in Domain-specific Question Answering. CoRR, Vol. abs\/2311.06503 (2023)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Shuai Zhao Linchao Zhu Xiaohan Wang and Yi Yang. 2022. CenterCLIP: Token Clustering for Efficient Text-Video Retrieval. In SIGIR. 970--981.","DOI":"10.1145\/3477495.3531950"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Yue Zhao Ishan Misra Philipp Kr\u00e4henb\u00fchl and Rohit Girdhar. 2023. Learning Video Representations from Large Language Models. In CVPR. 
6586--6597.","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"e_1_3_2_1_59_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681024","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681024","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:37Z","timestamp":1750295857000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681024"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":59,"alternative-id":["10.1145\/3664647.3681024","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681024","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}