{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,19]],"date-time":"2026-06-19T06:48:12Z","timestamp":1781851692058,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681358","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"866-875","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["HICEScore: A Hierarchical Metric for Image Captioning Evaluation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-3410-8513","authenticated-orcid":false,"given":"Zequn","family":"Zeng","sequence":"first","affiliation":[{"name":"National Key Laboratory of Radar Signal Processing, Xidian University, Xi'an, Shaanxi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8193-7940","authenticated-orcid":false,"given":"Jianqiao","family":"Sun","sequence":"additional","affiliation":[{"name":"National Key Laboratory of Radar Signal Processing, Xidian University, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2928-2692","authenticated-orcid":false,"given":"Hao","family":"Zhang","sequence":"additional","affiliation":[{"name":"National Key Laboratory of Radar Signal Processing, Xidian University, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7911-7694","authenticated-orcid":false,"given":"Tiansheng","family":"Wen","sequence":"additional","affiliation":[{"name":"National Key Laboratory of Radar Signal Processing, Xidian University, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4938-0683","authenticated-orcid":false,"given":"Yudi","family":"Su","sequence":"additional","affiliation":[{"name":"National Key Laboratory of Radar Signal Processing, Xidian university, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2929-8636","authenticated-orcid":false,"given":"Yan","family":"Xie","sequence":"additional","affiliation":[{"name":"National Key Laboratory of Radar Signal Processing, Xidian University, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1846-495X","authenticated-orcid":false,"given":"Zhengjue","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Integrated Service Networks, Xidian University, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5151-9388","authenticated-orcid":false,"given":"Bo","family":"Chen","sequence":"additional","affiliation":[{"name":"National Key Laboratory of Radar Signal Processing, Xidian University, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"From images to sentences through scene description graphs using commonsense reasoning and knowledge. arXiv preprint arXiv:1511.03292","author":"Aditya Somak","year":"2015","unstructured":"Somak Aditya, Yezhou Yang, Chitta Baral, Cornelia Fermuller, and Yiannis Aloimonos. 2015. From images to sentences through scene description graphs using commonsense reasoning and knowledge. arXiv preprint arXiv:1511.03292 (2015)."},{"key":"e_1_3_2_1_2_1","volume-title":"Abbas Sharifi, Ali Tarlani Beris, Mohammadsadegh Nouri, and Amir Sharifzadeh Javidi.","author":"Ahmadi Mohsen","year":"2023","unstructured":"Mohsen Ahmadi, Ahmad Gholizadeh Lonbar, Abbas Sharifi, Ali Tarlani Beris, Mohammadsadegh Nouri, and Amir Sharifzadeh Javidi. 2023. Application of segment anything model for civil infrastructure defect assessment. arXiv preprint arXiv:2304.12600 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings, Part V 14","author":"Anderson Peter","year":"2016","unstructured":"Peter Anderson, Basura Fernando, Mark Johnson, and Stephen Gould. 2016. Spice: Semantic propositional image caption evaluation. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part V 14. Springer, 382--398."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72."},{"key":"e_1_3_2_1_6_1","volume-title":"Intelligent and Fuzzy Techniques: Smart and Innovative Solutions: Proceedings of the INFUS 2020 Conference","author":"Cayli \u00d6zkan","year":"2020","unstructured":"\u00d6zkan cCayli, Burak Makav, Volkan Kilicc, and Aytuug Onan. 2021. Mobile application based automatic caption generation for visually impaired. In Intelligent and Fuzzy Techniques: Smart and Innovative Solutions: Proceedings of the INFUS 2020 Conference, Istanbul, Turkey, July 21--23, 2020. Springer, 1532--1539."},{"key":"e_1_3_2_1_7_1","volume-title":"Saliency-based spatiotemporal attention for video captioning. In 2018 IEEE fourth international conference on multimedia big data (BigMM)","author":"Chen Yangyu","unstructured":"Yangyu Chen, Weigang Zhang, Shuhui Wang, Liang Li, and Qingming Huang. 2018. Saliency-based spatiotemporal attention for video captioning. In 2018 IEEE fourth international conference on multimedia big data (BigMM). IEEE, 1--8."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00608"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-2074"},{"key":"e_1_3_2_1_11_1","volume-title":"Segment anything model (sam) meets glass: Mirror and transparent objects cannot be easily detected. arXiv preprint arXiv:2305.00278","author":"Han Dongsheng","year":"2023","unstructured":"Dongsheng Han, Chaoning Zhang, Yu Qiao, Maryam Qamar, Yuna Jung, SeungKyu Lee, Sung-Ho Bae, and Choong Seon Hong. 2023. Segment anything model (sam) meets glass: Mirror and transparent objects cannot be easily detected. arXiv preprint arXiv:2305.00278 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.5555\/2566972.2566993"},{"key":"e_1_3_2_1_14_1","volume-title":"InfoMetIC: An Informative Metric for Reference-free Image Caption Evaluation. arXiv preprint arXiv:2305.06002","author":"Hu Anwen","year":"2023","unstructured":"Anwen Hu, Shizhe Chen, Liang Zhang, and Qin Jin. 2023. InfoMetIC: An Informative Metric for Reference-free Image Caption Evaluation. arXiv preprint arXiv:2305.06002 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Expansionnet v2: Block static expansion in fast end to end training for image captioning. arXiv preprint arXiv:2208.06551","author":"Hu Jia Cheng","year":"2022","unstructured":"Jia Cheng Hu, Roberto Cavicchioli, and Alessandro Capotondi. 2022. Expansionnet v2: Block static expansion in fast end to end training for image captioning. arXiv preprint arXiv:2208.06551 (2022)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1220"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of naacL-HLT","volume":"1","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of naacL-HLT, Vol. 1. 2."},{"key":"e_1_3_2_1_19_1","volume-title":"Re-evaluating automatic metrics for image captioning. arXiv preprint arXiv:1612.07600","author":"Kilickaya Mert","year":"2016","unstructured":"Mert Kilickaya, Aykut Erdem, Nazli Ikizler-Cinbis, and Erkut Erdem. 2016. Re-evaluating automatic metrics for image captioning. arXiv preprint arXiv:1612.07600 (2016)."},{"key":"e_1_3_2_1_20_1","first-page":"35072","article-title":"Mutual information divergence: A unified metric for multimodal generative models","volume":"35","author":"Kim Jin-Hwa","year":"2022","unstructured":"Jin-Hwa Kim, Yunji Kim, Jiyoung Lee, Kang Min Yoo, and Sang-Woo Lee. 2022. Mutual information divergence: A unified metric for multimodal generative models. Advances in Neural Information Processing Systems, Vol. 35 (2022), 35072--35086.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_22_1","volume-title":"UMIC: An unreferenced metric for image captioning via contrastive learning. arXiv preprint arXiv:2106.14019","author":"Lee Hwanhee","year":"2021","unstructured":"Hwanhee Lee, Seunghyun Yoon, Franck Dernoncourt, Trung Bui, and Kyomin Jung. 2021. UMIC: An unreferenced metric for image captioning via contrastive learning. arXiv preprint arXiv:2106.14019 (2021)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.eval4nlp-1.4"},{"key":"e_1_3_2_1_24_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_26_1","volume-title":"Lizhen Qu, Gholamreza Haffari, Fei Li, Donghong Ji, and Quan Hung Tran.","author":"Li Zhuang","year":"2023","unstructured":"Zhuang Li, Yuyang Chai, Terry Zhuo Yue, Lizhen Qu, Gholamreza Haffari, Fei Li, Donghong Ji, and Quan Hung Tran. 2023. FACTUAL: A Benchmark for Faithful and Consistent Textual Scene Graph Parsing. arXiv preprint arXiv:2305.17497 (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.3115\/1220355.1220427"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_29_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-024-44824-z"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11237"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.5555\/1866696.1866717"},{"key":"e_1_3_2_1_35_1","volume-title":"Kaylee Burns, Trevor Darrell, and Kate Saenko.","author":"Rohrbach Anna","year":"2018","unstructured":"Anna Rohrbach, Lisa Anne Hendricks, Kaylee Burns, Trevor Darrell, and Kate Saenko. 2018. Object hallucination in image captioning. arXiv preprint arXiv:1809.02156 (2018)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00668"},{"key":"e_1_3_2_1_37_1","volume-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114","author":"Schuhmann Christoph","year":"2021","unstructured":"Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)."},{"key":"e_1_3_2_1_38_1","volume-title":"Foil it! find one mismatch between image and language caption. arXiv preprint arXiv:1705.01359","author":"Shekhar Ravi","year":"2017","unstructured":"Ravi Shekhar, Sandro Pezzelle, Yauhen Klimovich, Aur\u00e9lie Herbelot, Moin Nabi, Enver Sangineto, and Raffaella Bernardi. 2017. Foil it! find one mismatch between image and language caption. arXiv preprint arXiv:1705.01359 (2017)."},{"key":"e_1_3_2_1_39_1","volume-title":"Anything-3d: Towards single-view anything reconstruction in the wild. arXiv preprint arXiv:2304.10261","author":"Shen Qiuhong","year":"2023","unstructured":"Qiuhong Shen, Xingyi Yang, and Xinchao Wang. 2023. Anything-3d: Towards single-view anything reconstruction in the wild. arXiv preprint arXiv:2304.10261 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.3390\/diagnostics13111947"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01740"},{"key":"e_1_3_2_1_42_1","volume-title":"SnapCap: Efficient Snapshot Compressive Video Captioning. arXiv preprint arXiv:2401.04903","author":"Sun Jianqiao","year":"2024","unstructured":"Jianqiao Sun, Yudi Su, Hao Zhang, Ziheng Cheng, Zequn Zeng, Zhengjue Wang, Bo Chen, and Xin Yuan. 2024. SnapCap: Efficient Snapshot Compressive Video Captioning. arXiv preprint arXiv:2401.04903 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"Alpha-CLIP: A clip model focusing on wherever you want. arXiv preprint arXiv:2312.03818","author":"Sun Zeyi","year":"2023","unstructured":"Zeyi Sun, Ye Fang, Tong Wu, Pan Zhang, Yuhang Zang, Shu Kong, Yuanjun Xiong, Dahua Lin, and Jiaqi Wang. 2023. Alpha-CLIP: A clip model focusing on wherever you want. arXiv preprint arXiv:2312.03818 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Can sam segment anything? when sam meets camouflaged object detection. arXiv preprint arXiv:2304.04709","author":"Tang Lv","year":"2023","unstructured":"Lv Tang, Haoke Xiao, and Bo Li. 2023. Can sam segment anything? when sam meets camouflaged object detection. arXiv preprint arXiv:2304.04709 (2023)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/2213836.2213951"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964299"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01383"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20160"},{"key":"e_1_3_2_1_52_1","first-page":"17287","article-title":"Visual clues: Bridging vision and language foundations for image paragraph captioning","volume":"35","author":"Xie Yujia","year":"2022","unstructured":"Yujia Xie, Luowei Zhou, Xiyang Dai, Lu Yuan, Nguyen Bach, Ce Liu, and Michael Zeng. 2022. Visual clues: Bridging vision and language foundations for image paragraph captioning. Advances in Neural Information Processing Systems, Vol. 35 (2022), 17287--17300.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_53_1","volume-title":"International conference on machine learning. PMLR","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhudinov, Rich Zemel, and Yoshua Bengio. 2015. Show, attend and tell: Neural image caption generation with visual attention. In International conference on machine learning. PMLR, 2048--2057."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.93"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01337"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02247"},{"key":"e_1_3_2_1_60_1","volume-title":"Sung-Ho Bae, Seungkyu Lee, and Choong Seon Hong.","author":"Zhang Chaoning","year":"2023","unstructured":"Chaoning Zhang, Dongshen Han, Yu Qiao, Jung Uk Kim, Sung-Ho Bae, Seungkyu Lee, and Choong Seon Hong. 2023. Faster segment anything: Towards lightweight sam for mobile applications. arXiv preprint arXiv:2306.14289 (2023)."},{"key":"e_1_3_2_1_61_1","volume-title":"Mobilesamv2: Faster segment anything to everything. arXiv preprint arXiv:2312.09579","author":"Zhang Chaoning","year":"2023","unstructured":"Chaoning Zhang, Dongshen Han, Sheng Zheng, Jinwoo Choi, Tae-Ho Kim, and Choong Seon Hong. 2023. Mobilesamv2: Faster segment anything to everything. arXiv preprint arXiv:2312.09579 (2023)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_1_63_1","volume-title":"How segment anything model (SAM) boost medical image segmentation? arXiv preprint arXiv:2305.03678","author":"Zhang Yichi","year":"2023","unstructured":"Yichi Zhang and Rushi Jiao. 2023. How segment anything model (SAM) boost medical image segmentation? arXiv preprint arXiv:2305.03678 (2023)."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"e_1_3_2_1_65_1","volume-title":"Can sam segment polyps? arXiv preprint arXiv:2304.07583","author":"Zhou Tao","year":"2023","unstructured":"Tao Zhou, Yizhe Zhang, Yi Zhou, Ye Wu, and Chen Gong. 2023. Can sam segment polyps? arXiv preprint arXiv:2304.07583 (2023)."},{"key":"e_1_3_2_1_66_1","volume-title":"Chatgpt asks, blip-2 answers: Automatic questioning towards enriched visual descriptions. arXiv preprint arXiv:2303.06594","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Kilichbek Haydarov, Xiaoqian Shen, Wenxuan Zhang, and Mohamed Elhoseiny. 2023. Chatgpt asks, blip-2 answers: Automatic questioning towards enriched visual descriptions. arXiv preprint arXiv:2303.06594 (2023)."},{"key":"e_1_3_2_1_67_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681358","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681358","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:44Z","timestamp":1750295864000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681358"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":67,"alternative-id":["10.1145\/3664647.3681358","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681358","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}