{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:04:40Z","timestamp":1750309480844,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3688991","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"11414-11419","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["A Method for Visual Spatial Description Based on Large Language Model Fine-tuning"],"prefix":"10.1145","author":[{"given":"Jiabao","family":"Wang","sequence":"first","affiliation":[{"name":"School of Electrical Engineering, Guangxi University, Nanning, China"}]},{"given":"Fang","family":"Gao","sequence":"additional","affiliation":[{"name":"School of Electrical Engineering, Guangxi University, Nanning, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8441-8562","authenticated-orcid":false,"given":"Jingfeng","family":"Tang","sequence":"additional","affiliation":[{"name":"School of Electrical Engineering, Guangxi University, Nanning, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5034-8721","authenticated-orcid":false,"given":"Shaodong","family":"Li","sequence":"additional","affiliation":[{"name":"School of Electrical Engineering, Guangxi University, Nanning, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7660-7293","authenticated-orcid":false,"given":"Hanbo","family":"Zheng","sequence":"additional","affiliation":[{"name":"School of Electrical Engineering, Guangxi University, Nanning, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8470-8733","authenticated-orcid":false,"given":"Shengheng","family":"Ma","sequence":"additional","affiliation":[{"name":"Guangxi China-Tek Blue Valley Semiconductor Technology Co., Ltd., Nanning, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4733-4732","authenticated-orcid":false,"given":"Feng","family":"Shuang","sequence":"additional","affiliation":[{"name":"School of Electrical Engineering, Guangxi University, Nanning, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3197-8103","authenticated-orcid":false,"given":"Jun","family":"Yu","sequence":"additional","affiliation":[{"name":"Department of Automation, University of Science and Technology of China, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"volume-title":"PMLR","author":"Xu K","key":"e_1_3_2_1_1_1","unstructured":"Xu K, Ba J, Kiros R, et al. 2015. Show, attend and tell: Neural image caption generation with visual attention[C]\/\/International conference on machine learning. PMLR, 2048--2057."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3047929"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Diao H Zhang Y Ma L et al. 2021. Similarity reasoning and filtration for image-text matching[C]\/\/Proceedings of the AAAI conference on artificial intelligence. 35(2): 1218--1226.","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3071581"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2019.2955653"},{"key":"e_1_3_2_1_6_1","first-page":"1","article-title":"SAR ship target recognition via multiscale feature attention and adaptive-weighed classifier[J]","volume":"20","author":"Wang C","year":"2023","unstructured":"Wang C, Pei J, Luo S, et al. 2023. SAR ship target recognition via multiscale feature attention and adaptive-weighed classifier[J]. IEEE Geoscience and Remote Sensing Letters, 20: 1--5.","journal-title":"IEEE Geoscience and Remote Sensing Letters"},{"key":"e_1_3_2_1_7_1","volume-title":"Newsclippings: Automatic generation of out-of-context multimodal media[J]. arXiv preprint arXiv:2104.05893.","author":"Luo G","year":"2021","unstructured":"Luo G, Darrell T, Rohrbach A. 2021. Newsclippings: Automatic generation of out-of-context multimodal media[J]. arXiv preprint arXiv:2104.05893."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Cross-modal text and visual generation: A systematic review. Part 1: Image to text[J]. Information Fusion 93: 302--329.","DOI":"10.1016\/j.inffus.2023.01.008"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Zhao Y Wei J Lin Z et al. 2022. Visual spatial description: Controlled spatial-oriented image-to-text generation[J]. arXiv preprint arXiv:2210.11109.","DOI":"10.18653\/v1\/2022.emnlp-main.93"},{"key":"e_1_3_2_1_10_1","unstructured":"Li J Selvaraju R Gotmare A et al. 2021. Align before fuse: Vision and language representation learning with momentum distillation[J]. Advances in neural information processing systems 34: 9694--9705."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Vinyals O Toshev A Bengio S et al. 2015. Show and tell: A neural image caption generator[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 3156--3164.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_12_1","unstructured":"Vaswani A Shazeer N Parmar N et al. 2017. Attention is all you need[J]. Advances in neural information processing systems 6000--6010."},{"key":"e_1_3_2_1_13_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks[J]. Advances in neural information processing systems, 32.","author":"Lu J","year":"2019","unstructured":"Lu J, Batra D, Parikh D, et al. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks[J]. Advances in neural information processing systems, 32."},{"key":"e_1_3_2_1_14_1","volume-title":"Videobert: A joint model for video and language representation learning[C]\/\/Proceedings of the IEEE\/CVF international conference on computer vision. 7464--7473.","author":"Sun C","year":"2019","unstructured":"Sun C, Myers A, Vondrick C, et al. 2019. Videobert: A joint model for video and language representation learning[C]\/\/Proceedings of the IEEE\/CVF international conference on computer vision. 7464--7473."},{"key":"e_1_3_2_1_15_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers[J]. arXiv preprint arXiv:1908.07490.","author":"Tan H","year":"2019","unstructured":"Tan H, Bansal M. 2019. Lxmert: Learning cross-modality encoder representations from transformers[J]. arXiv preprint arXiv:1908.07490."},{"key":"e_1_3_2_1_16_1","volume-title":"Visualbert: A simple and performant baseline for vision and language[J]. arXiv preprint arXiv:1908.03557.","author":"Li L H","year":"2019","unstructured":"Li L H, Yatskar M, Yin D, et al. 2019. Visualbert: A simple and performant baseline for vision and language[J]. arXiv preprint arXiv:1908.03557."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Cornia M Stefanini M Baraldi L et al. 2020. Meshed-memory transformer for image captioning[C]\/\/Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 10578--10587.","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Zhou L Palangi H Zhang L et al. 2020. Unified vision-language pre-training for image captioning and vqa[C]\/\/Proceedings of the AAAI conference on artificial intelligence. 34(07): 13041--13049.","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Fei H Ren Y Wu S et al. 2021. Latent target-opinion as prior for document-level sentiment classification: A variational approach from fine-grained perspective[C]\/\/Proceedings of the web conference 2021. 553--564.","DOI":"10.1145\/3442381.3449789"},{"key":"e_1_3_2_1_20_1","unstructured":"Ronghang H Amanpreet S. 2021. Transformer is all you need: Multimodal multitask learning with a unified transformer[J]. arXiv preprint arXiv:2102.10772."},{"key":"e_1_3_2_1_21_1","volume-title":"Unimo: Towards unified-modal understanding and generation via cross-modal contrastive learning[J]. arXiv preprint arXiv:2012.15409.","author":"Li W","year":"2020","unstructured":"Li W, Gao C, Niu G, et al. 2020. Unimo: Towards unified-modal understanding and generation via cross-modal contrastive learning[J]. arXiv preprint arXiv:2012.15409."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Karpathy A Fei-Fei L. 2015. Deep visual-semantic alignments for generating image descriptions[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 3128--3137.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_23_1","volume-title":"Van Der Maaten L, et al","author":"Johnson J","year":"2017","unstructured":"Johnson J, Hariharan B, Van Der Maaten L, et al. 2017. Clevr: A diagnostic dataset for compositional language and elementary visual reasoning[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 2901--2910."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Agrawal A Batra D Parikh D et al. 2018. Don't just assume; look and answer: Overcoming priors for visual question answering[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 4971--4980.","DOI":"10.1109\/CVPR.2018.00522"},{"key":"e_1_3_2_1_25_1","volume-title":"Gqa: A new dataset for real-world visual reasoning and compositional question answering[C]\/\/Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 6700--6709.","author":"Hudson D A","year":"2019","unstructured":"Hudson D A, Manning C D. 2019. Gqa: A new dataset for real-world visual reasoning and compositional question answering[C]\/\/Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 6700--6709."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Chen L Jiang Z Xiao J et al. 2021. Human-like controllable image captioning with verb-specific semantic roles[C]\/\/Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 16846--16856.","DOI":"10.1109\/CVPR46437.2021.01657"},{"key":"e_1_3_2_1_27_1","volume-title":"Spatialsense: An adversarially crowdsourced benchmark for spatial relation recognition[C]\/\/Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2051--2060.","author":"Yang K","year":"2019","unstructured":"Yang K, Russakovsky O, Deng J. 2019. Spatialsense: An adversarially crowdsourced benchmark for spatial relation recognition[C]\/\/Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2051--2060."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3069041"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Yu W Zhu C Qin L et al. 2022. Diversifying content generation for commonsense reasoning with mixture of knowledge graph experts[J]. arXiv preprint arXiv:2203.07285.","DOI":"10.18653\/v1\/2022.findings-acl.149"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Zhao Y Fei H Ji W et al. 2023. Generating visual spatial description via holistic 3D scene understanding[J]. arXiv preprint arXiv:2305.11768.","DOI":"10.18653\/v1\/2023.acl-long.442"},{"key":"e_1_3_2_1_31_1","volume-title":"Bliva: A simple multimodal llm for better handling of text-rich visual questions[C]\/\/Proceedings of the AAAI Conference on Artificial Intelligence. 38(3): 2256--2264.","author":"Hu W","year":"2024","unstructured":"Hu W, Xu Y, Li Y, et al. 2024. Bliva: A simple multimodal llm for better handling of text-rich visual questions[C]\/\/Proceedings of the AAAI Conference on Artificial Intelligence. 38(3): 2256--2264."},{"issue":"70","key":"e_1_3_2_1_32_1","first-page":"1","article-title":"Scaling instruction-finetuned language models[J]","volume":"25","author":"Chung H W","year":"2024","unstructured":"Chung H W, Hou L, Longpre S, et al. 2024. Scaling instruction-finetuned language models[J]. Journal of Machine Learning Research, 25(70): 1--53.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_33_1","volume-title":"Eva-clip: Improved training techniques for clip at scale[J]. arXiv preprint arXiv:2303.15389.","author":"Sun Q","year":"2023","unstructured":"Sun Q, Fang Y, Wu L, et al. 2023. Eva-clip: Improved training techniques for clip at scale[J]. arXiv preprint arXiv:2303.15389."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Papineni K Roukos S Ward T et al. 2002. Bleu: a method for automatic evaluation of machine translation[C]\/\/Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_35_1","volume-title":"Spice: Semantic propositional image caption evaluation[C]\/\/Computer Vision--ECCV 2016: 14th European Conference","author":"Anderson P","year":"2016","unstructured":"Anderson P, Fernando B, Johnson M, et al. 2016. Spice: Semantic propositional image caption evaluation[C]\/\/Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part V 14. Springer International Publishing, 382--398."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3688991","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3688991","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:29Z","timestamp":1750295849000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3688991"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":35,"alternative-id":["10.1145\/3664647.3688991","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3688991","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}