{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T16:30:46Z","timestamp":1773246646533,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Guangdong Basic and Applied Basic Research Foundation","award":["2024A1515012027"],"award-info":[{"award-number":["2024A1515012027"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376069"],"award-info":[{"award-number":["62376069"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Young Elite Scientists Sponsorship Program by CAST","award":["2023QNRC001"],"award-info":[{"award-number":["2023QNRC001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681453","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"4148-4157","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Differential-Perceptive and Retrieval-Augmented MLLM for Change Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-0279-7934","authenticated-orcid":false,"given":"Xian","family":"Zhang","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0633-3722","authenticated-orcid":false,"given":"Haokun","family":"Wen","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0247-5221","authenticated-orcid":false,"given":"Jianlong","family":"Wu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9043-3220","authenticated-orcid":false,"given":"Pengda","family":"Qin","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2093-2839","authenticated-orcid":false,"given":"Hui","family":"Xue'","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1476-0273","authenticated-orcid":false,"given":"Liqiang","family":"Nie","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3154154"},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems.","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, Roman Ring, Eliza Rutherford, Serkan Cabi, Tengda Han, Zhitao Gong, Sina Samangooei, Marianne Monteiro, Jacob L. Menick, Sebastian Borgeaud, Andy Brock, Aida Nematzadeh, Sahand Sharifzadeh, Mikolaj Binkowski, Ricardo Barreira, Oriol Vinyals, Andrew Zisserman, and Kar\u00e9n Simonyan. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. In Proceedings of the Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization. ACL, 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In Proceedings of the Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization. ACL, 65--72."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3328224"},{"key":"e_1_3_2_1_6_1","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E Gonzalez, et al. 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org, Vol. 2, 3 (2023), 6."},{"key":"e_1_3_2_1_7_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven C. H. Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven C. H. Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In Proceedings of the Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613773"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the International Joint Conference on Natural Language Processing. ACL, 33--42","author":"Guo Zixin","year":"2022","unstructured":"Zixin Guo, Tzu-Jui Julius Wang, and Jorma Laaksonen. 2022. CLIP4IDC: CLIP for Image Difference Captioning. In Proceedings of the Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the International Joint Conference on Natural Language Processing. ACL, 33--42."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the International Conference on Machine Learning. PMLR, 3929--3938","author":"Guu Kelvin","year":"2020","unstructured":"Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Ming-Wei Chang. 2020. Retrieval Augmented Language Model Pre-Training. In Proceedings of the International Conference on Machine Learning. PMLR, 3929--3938."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00275"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3074803"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1436"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00210"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems.","author":"Lewis Patrick S. H.","year":"2020","unstructured":"Patrick S. H. Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2020. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In Proceedings of the Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the International Conference on Machine Learning. PMLR","author":"Li Junnan","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven C. H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In Proceedings of the International Conference on Machine Learning. PMLR, 19730--19742."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00325"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the Text Summarization Branches Out. ACL, 74--81","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Proceedings of the Text Summarization Branches Out. ACL, 74--81."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2023.3334492"},{"key":"e_1_3_2_1_21_1","unstructured":"Fenglin Liu Changchang Yin Xian Wu Shen Ge Ping Zhang and Xu Sun. 2021. Contrastive Attention for Automatic Chest X-ray Report Generation. In Findings of the Association for Computational Linguistics. ACL 269--280."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems.","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. In Proceedings of the Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the International Conference on Learning Representations. OpenReview.net.","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In Proceedings of the International Conference on Learning Representations. OpenReview.net."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the Annual Meeting of the Association for Computational Linguistics. ACL, 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a Method for Automatic Evaluation of Machine Translation. In Proceedings of the Annual Meeting of the Association for Computational Linguistics. ACL, 311--318."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00472"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00278"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Xiangxi Shi Xu Yang Jiuxiang Gu Shafiq R. Joty and Jianfei Cai. 2020. Finding It at Another Side: A Viewpoint-Adapted Matching Encoder for Change Captioning. (2020) 574--590.","DOI":"10.1007\/978-3-030-58568-6_34"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1002\/int.22821"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1182"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3268004"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3254162"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00263"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing. ACL, 9319--9329","author":"Tu Yunbin","year":"2021","unstructured":"Yunbin Tu, Liang Li, Chenggang Yan, Shengxiang Gao, and Zhengtao Yu. 2021. Rtextbackslash^3Net: Relation-embedded Representation Reconstruction Network for Change Captioning. In Proceedings of the Conference on Empirical Methods in Natural Language Processing. ACL, 9319--9329."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Yunbin Tu Tingting Yao Liang Li Jiedong Lou Shengxiang Gao Zhengtao Yu and Chenggang Yan. 2021. Semantic Relation-aware Difference Representation Learning for Change Captioning. In Findings of the Association for Computational Linguistics. ACL 63--73.","DOI":"10.18653\/v1\/2021.findings-acl.6"},{"key":"e_1_3_2_1_36_1","volume-title":"Representation Learning with Contrastive Predictive Coding. CoRR","author":"van den Oord A\u00e4ron","year":"2018","unstructured":"A\u00e4ron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. CoRR, Vol. abs\/1807.03748 (2018)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611817"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20218"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611830"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3242142"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00971"},{"key":"e_1_3_2_1_43_1","volume-title":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. CoRR","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. CoRR, Vol. abs\/2304.10592 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681453","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681453","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:47Z","timestamp":1750294667000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681453"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":43,"alternative-id":["10.1145\/3664647.3681453","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681453","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}