{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T15:12:10Z","timestamp":1781622730070,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021ZD0110700"],"award-info":[{"award-number":["2021ZD0110700"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U19B2043, 61976185"],"award-info":[{"award-number":["U19B2043, 61976185"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhejiang Natural Science Foundation","award":["LR19F020002"],"award-info":[{"award-number":["LR19F020002"]}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["226-2022-00087"],"award-info":[{"award-number":["226-2022-00087"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhejiang Innovation Foundation","award":["2019R52002"],"award-info":[{"award-number":["2019R52002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3548358","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:43:12Z","timestamp":1665416592000},"page":"4374-4384","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":21,"title":["Rethinking the Reference-based Distinctive Image Captioning"],"prefix":"10.1145","author":[{"given":"Yangjun","family":"Mao","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Long","family":"Chen","sequence":"additional","affiliation":[{"name":"Columbia University, New York, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhihong","family":"Jiang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhimeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jian","family":"Shao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jun","family":"Xiao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"Peter Anderson Xiaodong He Chris Buehler Damien Teney Mark Johnson Stephen Gould and Lei Zhang. 2018. Bottom-up and top-down attention for image captioning and visual question answering. In CVPR.  Peter Anderson Xiaodong He Chris Buehler Damien Teney Mark Johnson Stephen Gould and Lei Zhang. 2018. Bottom-up and top-down attention for image captioning and visual question answering. In CVPR.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_2_2_1","unstructured":"J. L. Ba J. R. Kiros and G. E. Hinton. 2016. Layer Normalization. (2016).  J. L. Ba J. R. Kiros and G. E. Hinton. 2016. Layer Normalization. (2016)."},{"key":"e_1_3_2_2_3_1","volume-title":"ACL workshop.","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie . 2005 . METEOR: An automatic metric for MT evaluation with improved correlation with human judgments . In ACL workshop. Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In ACL workshop."},{"key":"e_1_3_2_2_4_1","volume-title":"Groupcap: Group-based image captioning with structured relevance and diversity constraints. In CVPR. 1345--1353.","author":"Chen Fuhai","year":"2018","unstructured":"Fuhai Chen , Rongrong Ji , Xiaoshuai Sun , Yongjian Wu , and Jinsong Su . 2018 . Groupcap: Group-based image captioning with structured relevance and diversity constraints. In CVPR. 1345--1353. Fuhai Chen, Rongrong Ji, Xiaoshuai Sun, Yongjian Wu, and Jinsong Su. 2018. Groupcap: Group-based image captioning with structured relevance and diversity constraints. In CVPR. 1345--1353."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01657"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"crossref","unstructured":"Long Chen Hanwang Zhang Jun Xiao Xiangnan He Shiliang Pu and Shih-Fu Chang. 2019. Counterfactual critic multi-agent training for scene graph generation. In ICCV.  Long Chen Hanwang Zhang Jun Xiao Xiangnan He Shiliang Pu and Shih-Fu Chang. 2019. Counterfactual critic multi-agent training for scene graph generation. In ICCV.","DOI":"10.1109\/ICCV.2019.00471"},{"key":"e_1_3_2_2_7_1","volume-title":"Sca-cnn: Spatial and channel-wise attention in convolutional networks for image captioning. In CVPR.","author":"Chen Long","year":"2017","unstructured":"Long Chen , Hanwang Zhang , Jun Xiao , Liqiang Nie , Jian Shao , Wei Liu , and Tat-Seng Chua . 2017 . Sca-cnn: Spatial and channel-wise attention in convolutional networks for image captioning. In CVPR. Long Chen, Hanwang Zhang, Jun Xiao, Liqiang Nie, Jian Shao, Wei Liu, and Tat-Seng Chua. 2017. Sca-cnn: Spatial and channel-wise attention in convolutional networks for image captioning. In CVPR."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"crossref","unstructured":"Shizhe Chen Qin Jin Peng Wang and Qi Wu. 2020. Say As You Wish: Fine-grained Control of Image Caption Generation with Abstract Scene Graphs. In CVPR.  Shizhe Chen Qin Jin Peng Wang and Qi Wu. 2020. Say As You Wish: Fine-grained Control of Image Caption Generation with Abstract Scene Graphs. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00998"},{"key":"e_1_3_2_2_9_1","unstructured":"X. Chen H. Fang T. Y. Lin R. Vedantam S. Gupta P. Dollar and C. L. Zitnick. 2015. Microsoft COCO Captions: Data Collection and Evaluation Server. arXiv (2015).  X. Chen H. Fang T. Y. Lin R. Vedantam S. Gupta P. Dollar and C. L. Zitnick. 2015. Microsoft COCO Captions: Data Collection and Evaluation Server. arXiv (2015)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"crossref","unstructured":"M. Cornia M. Stefanini L. Baraldi and R. Cucchiara. 2020. Meshed-Memory Transformer for Image Captioning. In CVPR.  M. Cornia M. Stefanini L. Baraldi and R. Cucchiara. 2020. Meshed-Memory Transformer for Image Captioning. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","unstructured":"Bo Dai Sanja Fidler Raquel Urtasun and Dahua Lin. 2017. Towards diverse and natural image descriptions via a conditional gan. In ICCV.  Bo Dai Sanja Fidler Raquel Urtasun and Dahua Lin. 2017. Towards diverse and natural image descriptions via a conditional gan. In ICCV.","DOI":"10.1109\/ICCV.2017.323"},{"key":"e_1_3_2_2_12_1","unstructured":"Bo Dai and Dahua Lin. 2017. Contrastive learning for image captioning. In NeurIPS.  Bo Dai and Dahua Lin. 2017. Contrastive learning for image captioning. In NeurIPS."},{"key":"e_1_3_2_2_13_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR.  Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Lun Huang Wenmin Wang Jie Chen and Xiao-Yong Wei. 2019. Attention on attention for image captioning. In ICCV. 4634--4643.  Lun Huang Wenmin Wang Jie Chen and Xiao-Yong Wei. 2019. Attention on attention for image captioning. In ICCV. 4634--4643.","DOI":"10.1109\/ICCV.2019.00473"},{"key":"e_1_3_2_2_15_1","volume-title":"Densecap: Fully convolutional localization networks for dense captioning. In CVPR. 4565--4574.","author":"Johnson Justin","year":"2016","unstructured":"Justin Johnson , Andrej Karpathy , and Li Fei-Fei . 2016 . Densecap: Fully convolutional localization networks for dense captioning. In CVPR. 4565--4574. Justin Johnson, Andrej Karpathy, and Li Fei-Fei. 2016. Densecap: Fully convolutional localization networks for dense captioning. In CVPR. 4565--4574."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"crossref","unstructured":"Andrej Karpathy and Li Fei-Fei. 2015. Deep visual-semantic alignments for generating image descriptions. In CVPR.  Andrej Karpathy and Li Fei-Fei. 2015. Deep visual-semantic alignments for generating image descriptions. In CVPR.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_2_17_1","unstructured":"Guang Li Linchao Zhu Ping Liu and Yi Yang. 2019. Entangled transformer for image captioning. In ICCV. 8928--8937.  Guang Li Linchao Zhu Ping Liu and Yi Yang. 2019. Entangled transformer for image captioning. In ICCV. 8928--8937."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01830"},{"key":"e_1_3_2_2_19_1","unstructured":"Zhuowan Li Quan Tran Long Mai Zhe Lin and Alan L Yuille. 2020. Context-aware group captioning via self-attention and contrastive features. In CVPR. 3440--3450.  Zhuowan Li Quan Tran Long Mai Zhe Lin and Alan L Yuille. 2020. Context-aware group captioning via self-attention and contrastive features. In CVPR. 3440--3450."},{"key":"e_1_3_2_2_20_1","volume-title":"ACL workshop.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin . 2004 . Rouge: A package for automatic evaluation of summaries . In ACL workshop. Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In ACL workshop."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3086066"},{"key":"e_1_3_2_2_22_1","volume-title":"Region-Aware Image Captioning via Interaction Learning","author":"Liu An-An","year":"2021","unstructured":"An-An Liu , Yingchen Zhai , Ning Xu , Weizhi Nie , Wenhui Li , and Yongdong Zhang . 2021b. Region-Aware Image Captioning via Interaction Learning . IEEE Transactions on Circuits and Systems for Video Technology ( 2021 ). An-An Liu, Yingchen Zhai, Ning Xu, Weizhi Nie, Wenhui Li, and Yongdong Zhang. 2021b. Region-Aware Image Captioning via Interaction Learning. IEEE Transactions on Circuits and Systems for Video Technology (2021)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"Lixin Liu Jiajun Tang Xiaojun Wan and Zongming Guo. 2019. Generating Diverse and Descriptive Image Captions Using Visual Paraphrases. In ICCV.  Lixin Liu Jiajun Tang Xiaojun Wan and Zongming Guo. 2019. Generating Diverse and Descriptive Image Captions Using Visual Paraphrases. In ICCV.","DOI":"10.1109\/ICCV.2019.00434"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"crossref","unstructured":"Siqi Liu Zhenhai Zhu Ning Ye Sergio Guadarrama and Kevin Murphy. 2017. Improved image captioning via policy gradient optimization of spider. In ICCV. 873--881.  Siqi Liu Zhenhai Zhu Ning Ye Sergio Guadarrama and Kevin Murphy. 2017. Improved image captioning via policy gradient optimization of spider. In ICCV. 873--881.","DOI":"10.1109\/ICCV.2017.100"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"crossref","unstructured":"Xihui Liu Hongsheng Li Jing Shao Dapeng Chen and Xiaogang Wang. 2018. Show tell and discriminate: Image captioning by self-retrieval with partially labeled data. In ECCV.  Xihui Liu Hongsheng Li Jing Shao Dapeng Chen and Xiaogang Wang. 2018. Show tell and discriminate: Image captioning by self-retrieval with partially labeled data. In ECCV.","DOI":"10.1007\/978-3-030-01267-0_21"},{"key":"e_1_3_2_2_26_1","unstructured":"Ruotian Luo Brian Price Scott Cohen and Gregory Shakhnarovich. 2018. Discriminability objective for training descriptive captions. In CVPR.  Ruotian Luo Brian Price Scott Cohen and Gregory Shakhnarovich. 2018. Discriminability objective for training descriptive captions. In CVPR."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"crossref","unstructured":"Kishore Papineni Salim Roukos Todd Ward and Wei-Jing Zhu. 2002. BLEU: a method for automatic evaluation of machine translation. In ACL.  Kishore Papineni Salim Roukos Todd Ward and Wei-Jing Zhu. 2002. BLEU: a method for automatic evaluation of machine translation. In ACL.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Dong Huk Park Trevor Darrell and Anna Rohrbach. 2019. Robust change captioning. In ICCV. 4624--4633.  Dong Huk Park Trevor Darrell and Anna Rohrbach. 2019. Robust change captioning. In ICCV. 4624--4633.","DOI":"10.1109\/ICCV.2019.00472"},{"key":"e_1_3_2_2_29_1","volume-title":"Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models. In ICCV.","author":"Plummer B. A.","year":"2016","unstructured":"B. A. Plummer , L. Wang , C. M. Cervantes , J. C. Caicedo , and S. Lazebnik . 2016 . Flickr 30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models. In ICCV. B. A. Plummer, L. Wang, C. M. Cervantes, J. C. Caicedo, and S. Lazebnik. 2016. Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models. In ICCV."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"crossref","unstructured":"Yue Qiu Shintaro Yamamoto Kodai Nakashima Ryota Suzuki Kenji Iwata Hirokatsu Kataoka and Yutaka Satoh. 2021. Describing and Localizing Multiple Changes with Transformers. In ICCV. 1971--1980.  Yue Qiu Shintaro Yamamoto Kodai Nakashima Ryota Suzuki Kenji Iwata Hirokatsu Kataoka and Yutaka Satoh. 2021. Describing and Localizing Multiple Changes with Transformers. In ICCV. 1971--1980.","DOI":"10.1109\/ICCV48922.2021.00198"},{"key":"e_1_3_2_2_31_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford , Jong Wook Kim , Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021 . Learning transferable visual models from natural language supervision. In ICML. 8748--8763. Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. 8748--8763."},{"key":"e_1_3_2_2_32_1","volume-title":"Sequence level training with recurrent neural networks. arXiv","author":"Ranzato Marc'Aurelio","year":"2015","unstructured":"Marc'Aurelio Ranzato , Sumit Chopra , Michael Auli , and Wojciech Zaremba . 2015. Sequence level training with recurrent neural networks. arXiv ( 2015 ). Marc'Aurelio Ranzato, Sumit Chopra, Michael Auli, and Wojciech Zaremba. 2015. Sequence level training with recurrent neural networks. arXiv (2015)."},{"key":"e_1_3_2_2_33_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","author":"Ren Shaoqing","year":"2016","unstructured":"Shaoqing Ren , Kaiming He , Ross Girshick , and Jian Sun . 2016. Faster r-cnn: Towards real-time object detection with region proposal networks . IEEE TPAMI ( 2016 ). Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2016. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE TPAMI (2016)."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"Steven J Rennie Etienne Marcheret Youssef Mroueh Jerret Ross and Vaibhava Goel. 2017. Self-critical sequence training for image captioning. In CVPR.  Steven J Rennie Etienne Marcheret Youssef Mroueh Jerret Ross and Vaibhava Goel. 2017. Self-critical sequence training for image captioning. In CVPR.","DOI":"10.1109\/CVPR.2017.131"},{"key":"e_1_3_2_2_35_1","volume-title":"Mario Fritz, and Bernt Schiele.","author":"Shetty Rakshith","year":"2017","unstructured":"Rakshith Shetty , Marcus Rohrbach , Lisa Anne Hendricks , Mario Fritz, and Bernt Schiele. 2017 . Speaking the same language: Matching machine to human captions by adversarial training. In ICCV. Rakshith Shetty, Marcus Rohrbach, Lisa Anne Hendricks, Mario Fritz, and Bernt Schiele. 2017. Speaking the same language: Matching machine to human captions by adversarial training. In ICCV."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"crossref","unstructured":"Alane Suhr Stephanie Zhou Ally Zhang Iris Zhang Huajun Bai and Yoav Artzi. 2019. A corpus for reasoning about natural language grounded in photographs. (2019).  Alane Suhr Stephanie Zhou Ally Zhang Iris Zhang Huajun Bai and Yoav Artzi. 2019. A corpus for reasoning about natural language grounded in photographs. (2019).","DOI":"10.18653\/v1\/P19-1644"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"crossref","unstructured":"Hao Tan Franck Dernoncourt Zhe Lin Trung Bui and Mohit Bansal. 2019. Expressing visual relationships via language. (2019).  Hao Tan Franck Dernoncourt Zhe Lin Trung Bui and Mohit Bansal. 2019. Expressing visual relationships via language. (2019).","DOI":"10.18653\/v1\/P19-1182"},{"key":"e_1_3_2_2_38_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In NeurIPS.  Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In NeurIPS."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Ramakrishna Vedantam Samy Bengio Kevin Murphy Devi Parikh and Gal Chechik. 2017. Context-aware captions from context-agnostic supervision. In CVPR.  Ramakrishna Vedantam Samy Bengio Kevin Murphy Devi Parikh and Gal Chechik. 2017. Context-aware captions from context-agnostic supervision. In CVPR.","DOI":"10.1109\/CVPR.2017.120"},{"key":"e_1_3_2_2_40_1","volume-title":"Cider: Consensus-based image description evaluation. In CVPR.","author":"Vedantam Ramakrishna","year":"2015","unstructured":"Ramakrishna Vedantam , C Lawrence Zitnick , and Devi Parikh . 2015 . Cider: Consensus-based image description evaluation. In CVPR. Ramakrishna Vedantam, C Lawrence Zitnick, and Devi Parikh. 2015. Cider: Consensus-based image description evaluation. In CVPR."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"crossref","unstructured":"Oriol Vinyals Alexander Toshev Samy Bengio and Dumitru Erhan. 2015. Show and tell: A neural image caption generator. In CVPR.  Oriol Vinyals Alexander Toshev Samy Bengio and Dumitru Erhan. 2015. Show and tell: A neural image caption generator. In CVPR.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"crossref","unstructured":"Jiuniu Wang Wenjia Xu Qingzhong Wang and Antoni B Chan. 2020b. Compare and Reweight: Distinctive Image Captioning Using Similar Images Sets. In ECCV.  Jiuniu Wang Wenjia Xu Qingzhong Wang and Antoni B Chan. 2020b. Compare and Reweight: Distinctive Image Captioning Using Similar Images Sets. In ECCV.","DOI":"10.1007\/978-3-030-58452-8_22"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"crossref","unstructured":"Jiuniu Wang Wenjia Xu Qingzhong Wang and Antoni B Chan. 2021b. Group-based distinctive image captioning with memory attention. In ACM MM. 5020--5028.  Jiuniu Wang Wenjia Xu Qingzhong Wang and Antoni B Chan. 2021b. Group-based distinctive image captioning with memory attention. In ACM MM. 5020--5028.","DOI":"10.1145\/3474085.3475215"},{"key":"e_1_3_2_2_44_1","volume-title":"High-Order Interaction Learning for Image Captioning","author":"Wang Yanhui","year":"2021","unstructured":"Yanhui Wang , Ning Xu , An-An Liu , Wenhui Li , and Yongdong Zhang . 2021a. High-Order Interaction Learning for Image Captioning . IEEE Transactions on Circuits and Systems for Video Technology ( 2021 ). Yanhui Wang, Ning Xu, An-An Liu, Wenhui Li, and Yongdong Zhang. 2021a. High-Order Interaction Learning for Image Captioning. IEEE Transactions on Circuits and Systems for Video Technology (2021)."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"crossref","unstructured":"Zeyu Wang Berthy Feng Karthik Narasimhan and Olga Russakovsky. 2020a. Towards Unique and Informative Captioning of Images. In ECCV.  Zeyu Wang Berthy Feng Karthik Narasimhan and Olga Russakovsky. 2020a. Towards Unique and Informative Captioning of Images. In ECCV.","DOI":"10.1007\/978-3-030-58571-6_37"},{"key":"e_1_3_2_2_46_1","unstructured":"Kelvin Xu Jimmy Ba Ryan Kiros Kyunghyun Cho Aaron Courville Ruslan Salakhudinov Rich Zemel and Yoshua Bengio. 2015. Show attend and tell: Neural image caption generation with visual attention. In ICML.  Kelvin Xu Jimmy Ba Ryan Kiros Kyunghyun Cho Aaron Courville Ruslan Salakhudinov Rich Zemel and Yoshua Bengio. 2015. Show attend and tell: Neural image caption generation with visual attention. In ICML."},{"key":"e_1_3_2_2_47_1","volume-title":"Scene graph inference via multi-scale context modeling","author":"Xu Ning","year":"2020","unstructured":"Ning Xu , An-An Liu , Yongkang Wong , Weizhi Nie , Yuting Su , and Mohan Kankanhalli . 2020. Scene graph inference via multi-scale context modeling . IEEE Transactions on Circuits and Systems for Video Technology ( 2020 ), 1031--1041. Ning Xu, An-An Liu, Yongkang Wong, Weizhi Nie, Yuting Su, and Mohan Kankanhalli. 2020. Scene graph inference via multi-scale context modeling. IEEE Transactions on Circuits and Systems for Video Technology (2020), 1031--1041."},{"key":"e_1_3_2_2_48_1","volume-title":"Multi-level policy and reward-based deep reinforcement learning framework for image captioning","author":"Xu Ning","year":"2019","unstructured":"Ning Xu , Hanwang Zhang , An-An Liu , Weizhi Nie , Yuting Su , Jie Nie , and Yongdong Zhang . 2019. Multi-level policy and reward-based deep reinforcement learning framework for image captioning . IEEE Transactions on Multimedia ( 2019 ), 1372--1383. Ning Xu, Hanwang Zhang, An-An Liu, Weizhi Nie, Yuting Su, Jie Nie, and Yongdong Zhang. 2019. Multi-level policy and reward-based deep reinforcement learning framework for image captioning. IEEE Transactions on Multimedia (2019), 1372--1383."},{"key":"e_1_3_2_2_49_1","volume-title":"Tsu-Jui Fu, and William Yang Wang.","author":"Yan An","year":"2021","unstructured":"An Yan , Xin Eric Wang , Tsu-Jui Fu, and William Yang Wang. 2021 . L2C: Describing Visual Differences Needs Semantic Understanding of Individuals . (2021). An Yan, Xin Eric Wang, Tsu-Jui Fu, and William Yang Wang. 2021. L2C: Describing Visual Differences Needs Semantic Understanding of Individuals. (2021)."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"crossref","unstructured":"Xu Yang Kaihua Tang Hanwang Zhang and Jianfei Cai. 2019. Auto-encoding scene graphs for image captioning. In CVPR. 10685--10694.  Xu Yang Kaihua Tang Hanwang Zhang and Jianfei Cai. 2019. Auto-encoding scene graphs for image captioning. In CVPR. 10685--10694.","DOI":"10.1109\/CVPR.2019.01094"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"crossref","unstructured":"Ting Yao Yingwei Pan Yehao Li and Tao Mei. 2018. Exploring visual relationship for image captioning. In ECCV. 684--699.  Ting Yao Yingwei Pan Yehao Li and Tao Mei. 2018. Exploring visual relationship for image captioning. In ECCV. 684--699.","DOI":"10.1007\/978-3-030-01264-9_42"}],"event":{"name":"MM '22: The 30th ACM International Conference on Multimedia","location":"Lisboa Portugal","acronym":"MM '22","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548358","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3548358","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:44Z","timestamp":1750186844000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548358"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":51,"alternative-id":["10.1145\/3503161.3548358","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3548358","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}