{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,10]],"date-time":"2026-07-10T03:53:40Z","timestamp":1783655620567,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"NSFC","award":["61972298"],"award-info":[{"award-number":["61972298"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475439","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T04:59:18Z","timestamp":1634533158000},"page":"2615-2624","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":61,"title":["Dual Graph Convolutional Networks with Transformer and Curriculum Learning for Image Captioning"],"prefix":"10.1145","author":[{"given":"Xinzhi","family":"Dong","sequence":"first","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chengjiang","family":"Long","sequence":"additional","affiliation":[{"name":"JD Finance America Corporation, Mountain View, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wenju","family":"Xu","sequence":"additional","affiliation":[{"name":"InnoPeak Technology Inc., Palo Alto, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chunxia","family":"Xiao","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_2_1_1","first-page":"6077","volume-title":"Lei Zhang. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Anderson Peter","year":"2018","unstructured":"Peter Anderson , Xiaodong He , Chris Buehler , Damien Teney , Mark Johnson , Stephen Gould , and Lei Zhang. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 6077 -- 6086 , 2018 . Peter Anderson, Xiaodong He, Chris Buehler, Damien Teney, Mark Johnson, Stephen Gould, and Lei Zhang. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 6077--6086, 2018."},{"key":"e_1_3_2_2_2_1","volume-title":"Curriculum learning. page 41--48","author":"Bengio Yoshua","year":"2009","unstructured":"Yoshua Bengio , J\u00e9r\u00f4me Louradour , Ronan Collobert , and Jason Weston . Curriculum learning. page 41--48 . Association for Computing Machinery (ACM) , 2009 . Yoshua Bengio, J\u00e9r\u00f4me Louradour, Ronan Collobert, and Jason Weston. Curriculum learning. page 41--48. Association for Computing Machinery (ACM), 2009."},{"key":"e_1_3_2_2_3_1","first-page":"6298","volume-title":"Tat-Seng Chua. SCA-CNN: Spatial and Channel-wise Attention in Convolutional Networks for Image Captioning. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Chen Long","year":"2017","unstructured":"Long Chen , Hanwang Zhang , Jun Xiao , Liqiang Nie , Jian Shao , Wei Liu , and Tat-Seng Chua. SCA-CNN: Spatial and Channel-wise Attention in Convolutional Networks for Image Captioning. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 6298 -- 6306 , 2017 . Long Chen, Hanwang Zhang, Jun Xiao, Liqiang Nie, Jian Shao, Wei Liu, and Tat-Seng Chua. SCA-CNN: Spatial and Channel-wise Attention in Convolutional Networks for Image Captioning. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 6298--6306, 2017."},{"key":"e_1_3_2_2_4_1","first-page":"8299","volume-title":"Control and Tell: A Framework for Generating Controllable and Grounded Captions. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Cornia Marcella","year":"2019","unstructured":"Marcella Cornia , Lorenzo Baraldi , and Rita Cucchiara . Show , Control and Tell: A Framework for Generating Controllable and Grounded Captions. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 8299 -- 8308 , 2019 . Marcella Cornia, Lorenzo Baraldi, and Rita Cucchiara. Show, Control and Tell: A Framework for Generating Controllable and Grounded Captions. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 8299--8308, 2019."},{"key":"e_1_3_2_2_5_1","first-page":"10578","volume-title":"Rita Cucchiara. Meshed-Memory Transformer for Image Captioning. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Cornia Marcella","year":"2020","unstructured":"Marcella Cornia , Matteo Stefanini , Lorenzo Baraldi , and Rita Cucchiara. Meshed-Memory Transformer for Image Captioning. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 10578 -- 10587 , 2020 . Marcella Cornia, Matteo Stefanini, Lorenzo Baraldi, and Rita Cucchiara. Meshed-Memory Transformer for Image Captioning. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 10578--10587, 2020."},{"key":"e_1_3_2_2_6_1","first-page":"3298","volume-title":"Dahua Lin. Detecting Visual Relationships with Deep Relational Networks. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Dai Bo","year":"2017","unstructured":"Bo Dai , Yuqi Zhang , and Dahua Lin. Detecting Visual Relationships with Deep Relational Networks. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 3298 -- 3308 , 2017 . Bo Dai, Yuqi Zhang, and Dahua Lin. Detecting Visual Relationships with Deep Relational Networks. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 3298--3308, 2017."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-3348"},{"key":"e_1_3_2_2_8_1","volume-title":"Kenton Lee, and Kristina Toutanova. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin , Ming Wei Chang , Kenton Lee, and Kristina Toutanova. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 , 2018 . Jacob Devlin, Ming Wei Chang, Kenton Lee, and Kristina Toutanova. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805, 2018."},{"key":"e_1_3_2_2_9_1","first-page":"2625","volume-title":"Trevor Darrell. Long-term Recurrent Convolutional Networks for Visual Recognition and Description. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Donahue Jeff","year":"2015","unstructured":"Jeff Donahue , Lisa Anne Hendricks , Sergio Guadarrama , Marcus Rohrbach , Subhashini Venugopalan , Kate Saenko , and Trevor Darrell. Long-term Recurrent Convolutional Networks for Visual Recognition and Description. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 2625 -- 2634 , 2015 . Jeff Donahue, Lisa Anne Hendricks, Sergio Guadarrama, Marcus Rohrbach, Subhashini Venugopalan, Kate Saenko, and Trevor Darrell. Long-term Recurrent Convolutional Networks for Visual Recognition and Description. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 2625--2634, 2015."},{"key":"e_1_3_2_2_10_1","volume-title":"A comprehensive pipeline for complex text-to-image synthesis. Journal of Computer Science and Technology(JCST), 35(3):522--537","author":"Fang Fei","year":"2020","unstructured":"Fei Fang , Fei Luo , Hongpan Zhang , Huajian Zhou , and Chunxia Xiao . A comprehensive pipeline for complex text-to-image synthesis. Journal of Computer Science and Technology(JCST), 35(3):522--537 , 2020 . Fei Fang, Fei Luo, Hongpan Zhang, Huajian Zhou, and Chunxia Xiao. A comprehensive pipeline for complex text-to-image synthesis. Journal of Computer Science and Technology(JCST), 35(3):522--537, 2020."},{"key":"e_1_3_2_2_11_1","article-title":"Narrative collage of image collections by scene graph recombination","author":"Fang Fei","year":"2018","unstructured":"Fei Fang , Miao Yi , Hui Feng , Shenghong Hu , and Chunxia Xiao . Narrative collage of image collections by scene graph recombination . IEEE Transactions on Visualization & Computer Graphics(TVCG), PP(99):2559--2572 , 2018 . Fei Fang, Miao Yi, Hui Feng, Shenghong Hu, and Chunxia Xiao. Narrative collage of image collections by scene graph recombination. IEEE Transactions on Visualization & Computer Graphics(TVCG), PP(99):2559--2572, 2018.","journal-title":"IEEE Transactions on Visualization & Computer Graphics(TVCG), PP(99):2559--2572"},{"key":"e_1_3_2_2_12_1","volume-title":"Few-shot learning with graph neural networks. arXiv preprint arXiv:1711.04043","author":"Garcia Victor","year":"2017","unstructured":"Victor Garcia and Joan Bruna . Few-shot learning with graph neural networks. arXiv preprint arXiv:1711.04043 , 2017 . Victor Garcia and Joan Bruna. Few-shot learning with graph neural networks. arXiv preprint arXiv:1711.04043, 2017."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01249-6_9"},{"key":"e_1_3_2_2_14_1","first-page":"770","volume-title":"Jian Sun. Deep Residual Learning for Image Recognition. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"He Kaiming","year":"2016","unstructured":"Kaiming He , Xiangyu Zhang , Shaoqing Ren , and Jian Sun. Deep Residual Learning for Image Recognition. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 770 -- 778 , 2016 . Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. Deep Residual Learning for Image Recognition. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 770--778, 2016."},{"key":"e_1_3_2_2_15_1","volume-title":"Proceedings of the Asian Conference on Computer Vision (ACCV)","volume":"169","author":"He Sen","year":"2020","unstructured":"Sen He , Wentong Liao , Hamed R Tavakoli , Michael Yang , Bodo Rosenhahn , and Nicolas Pugeault . Image captioning through image transformer . In Proceedings of the Asian Conference on Computer Vision (ACCV) , volume PP, pages 153-- 169 , 2020 . Sen He, Wentong Liao, Hamed R Tavakoli, Michael Yang, Bodo Rosenhahn, and Nicolas Pugeault. Image captioning through image transformer. In Proceedings of the Asian Conference on Computer Vision (ACCV), volume PP, pages 153--169, 2020."},{"key":"e_1_3_2_2_16_1","volume-title":"Proceedings of the Advances in neural information processing systems (NeurIPS)","volume":"32","author":"Herdade Simao","year":"2019","unstructured":"Simao Herdade , Armin Kappeler , Kofi Boakye , and Joao Soares . Image Captioning : Transforming Objects into Words . In Proceedings of the Advances in neural information processing systems (NeurIPS) , volume 32 , 2019 . Simao Herdade, Armin Kappeler, Kofi Boakye, and Joao Soares. Image Captioning: Transforming Objects into Words. In Proceedings of the Advances in neural information processing systems (NeurIPS), volume 32, 2019."},{"key":"e_1_3_2_2_17_1","volume-title":"Generating video animation from single still image in social media based on intelligent computing. Journal of Visual Communication and Image Representation(JVCI), 71:102812","author":"Hu Tao","year":"2020","unstructured":"Tao Hu , Chao Liang , Geyong Min , Keqin Li , and Chunxia Xiao . Generating video animation from single still image in social media based on intelligent computing. Journal of Visual Communication and Image Representation(JVCI), 71:102812 , 2020 . Tao Hu, Chao Liang, Geyong Min, Keqin Li, and Chunxia Xiao. Generating video animation from single still image in social media based on intelligent computing. Journal of Visual Communication and Image Representation(JVCI), 71:102812, 2020."},{"key":"e_1_3_2_2_18_1","volume-title":"Crd-cgan: Category-consistent and relativistic constraints for diverse text-to-image generation","author":"Hu Tao","year":"2021","unstructured":"Tao Hu , Chengjiang Long , and Chunxia Xiao . Crd-cgan: Category-consistent and relativistic constraints for diverse text-to-image generation . 2021 . Tao Hu, Chengjiang Long, and Chunxia Xiao. Crd-cgan: Category-consistent and relativistic constraints for diverse text-to-image generation. 2021."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3061927"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2682082"},{"key":"e_1_3_2_2_21_1","first-page":"4633","volume-title":"Xiao-Yong Wei. Attention on Attention for Image Captioning. In Proceedings of the IEEE International Conference on Computer Vision (ICCV)","author":"Huang Lun","year":"2019","unstructured":"Lun Huang , Wenmin Wang , Jie Chen , and Xiao-Yong Wei. Attention on Attention for Image Captioning. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , pages 4633 -- 4642 , 2019 . Lun Huang, Wenmin Wang, Jie Chen, and Xiao-Yong Wei. Attention on Attention for Image Captioning. In Proceedings of the IEEE International Conference on Computer Vision (ICCV), pages 4633--4642, 2019."},{"key":"e_1_3_2_2_22_1","volume-title":"AAAI Conference on Artificial Intelligence (AAAI)","author":"Islam Ashraful","year":"2021","unstructured":"Ashraful Islam , Chengjiang Long , and Richard Radke . A hybrid attention mechanism for weakly-supervised temporal action localization . In AAAI Conference on Artificial Intelligence (AAAI) , 2021 . Ashraful Islam, Chengjiang Long, and Richard Radke. A hybrid attention mechanism for weakly-supervised temporal action localization. In AAAI Conference on Artificial Intelligence (AAAI), 2021."},{"key":"e_1_3_2_2_23_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Jiang Lu","year":"2018","unstructured":"Lu Jiang , Zhengyuan Zhou , Thomas Leung , Li Jia Li , and Li Fei-Fei . Mentornet : Learning data-driven curriculum for very deep neural networks on corrupted labels . In Proceedings of the International Conference on Machine Learning (ICML) , 2018 . Lu Jiang, Zhengyuan Zhou, Thomas Leung, Li Jia Li, and Li Fei-Fei. Mentornet: Learning data-driven curriculum for very deep neural networks on corrupted labels. In Proceedings of the International Conference on Machine Learning (ICML), 2018."},{"key":"e_1_3_2_2_24_1","first-page":"4565","volume-title":"Li Fei-Fei. DenseCap: Fully Convolutional Localization Networks for Dense Captioning. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Johnson Justin","year":"2016","unstructured":"Justin Johnson , Andrej Karpathy , and Li Fei-Fei. DenseCap: Fully Convolutional Localization Networks for Dense Captioning. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 4565 -- 4574 , 2016 . Justin Johnson, Andrej Karpathy, and Li Fei-Fei. DenseCap: Fully Convolutional Localization Networks for Dense Captioning. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 4565--4574, 2016."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2598339"},{"key":"e_1_3_2_2_26_1","first-page":"5425","volume-title":"Proceeding of the International Conference on Learning Representations (ICLR)","author":"Thomas","year":"2017","unstructured":"Thomas N. Kipf and Max Welling. Semi-Supervised Classification with Graph Convolutional Networks . In Proceeding of the International Conference on Learning Representations (ICLR) , pages 5425 -- 5434 , 2017 . Thomas N. Kipf and Max Welling. Semi-Supervised Classification with Graph Convolutional Networks. In Proceeding of the International Conference on Learning Representations (ICLR), pages 5425--5434, 2017."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_2_28_1","first-page":"84","volume-title":"ImageNet Classification with Deep Convolutional Neural Networks","author":"Krizhevsky Alex","year":"2017","unstructured":"Alex Krizhevsky , Ilya Sutskever , and Geoffrey E. Hinton . ImageNet Classification with Deep Convolutional Neural Networks . volume 60 , pages 84 -- 90 , 2017 . Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton. ImageNet Classification with Deep Convolutional Neural Networks. volume 60, pages 84--90, 2017."},{"key":"e_1_3_2_2_29_1","first-page":"8927","volume-title":"Yi Yang. Entangled Transformer for Image Captioning. In Proceedings of the IEEE International Conference on Computer Vision (ICCV)","author":"Li Guang","year":"2019","unstructured":"Guang Li , Linchao Zhu , Ping Liu , and Yi Yang. Entangled Transformer for Image Captioning. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , pages 8927 -- 8936 , 2019 . Guang Li, Linchao Zhu, Ping Liu, and Yi Yang. Entangled Transformer for Image Captioning. In Proceedings of the IEEE International Conference on Computer Vision (ICCV), pages 8927--8936, 2019."},{"key":"e_1_3_2_2_30_1","first-page":"74","volume-title":"Text Summarization Branches Out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin . ROUGE : A package for automatic evaluation of summaries . In Text Summarization Branches Out , pages 74 -- 81 , 2004 . Chin-Yew Lin. ROUGE: A package for automatic evaluation of summaries. In Text Summarization Branches Out, pages 74--81, 2004."},{"key":"e_1_3_2_2_31_1","first-page":"740","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV)","volume":"8693","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin , Michael Maire , Serge Belongie , James Hays , Pietro Perona , Deva Ramanan , Piotr Dollar , and C. Lawrence Zitnick . Microsoft coco : Common objects in context . In Proceedings of the European Conference on Computer Vision (ECCV) , volume 8693 , pages 740 -- 755 , 2014 . Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollar, and C.Lawrence Zitnick. Microsoft coco: Common objects in context. In Proceedings of the European Conference on Computer Vision (ECCV), volume 8693, pages 740--755, 2014."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00176"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.325"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.524"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0834-9"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.345"},{"key":"e_1_3_2_2_37_1","first-page":"20","volume-title":"Abhinav Gupta. The More You Know: Using Knowledge Graphs for Image Classification. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Marino Kenneth","year":"2017","unstructured":"Kenneth Marino , Ruslan Salakhutdinov , and Abhinav Gupta. The More You Know: Using Knowledge Graphs for Image Classification. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 20 -- 28 , 2017 . Kenneth Marino, Ruslan Salakhutdinov, and Abhinav Gupta. The More You Know: Using Knowledge Graphs for Image Classification. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 20--28, 2017."},{"key":"e_1_3_2_2_38_1","first-page":"311","volume-title":"Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL)","author":"Papineni K","year":"2002","unstructured":"K Papineni , S Roukos , T Ward , and WJ Zhu . BLEU : a method for automatic evaluation of machine translation . In Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL) , pages 311 -- 318 , 2002 . K Papineni, S Roukos, T Ward, and WJ Zhu. BLEU: a method for automatic evaluation of machine translation. In Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL), pages 311--318, 2002."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_2_2_40_1","first-page":"8994","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Shi Liushuai","year":"2021","unstructured":"Liushuai Shi , Le Wang , Chengjiang Long , Sanping Zhou , Mo Zhou , Zhenxing Niu , and Gang Hua . Sgcn : Sparse graph convolution for pedestrian trajectory prediction . In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , pages 8994 -- 9003 , 2021 . Liushuai Shi, Le Wang, Chengjiang Long, Sanping Zhou, Mo Zhou, Zhenxing Niu, and Gang Hua. Sgcn: Sparse graph convolution for pedestrian trajectory prediction. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pages 8994--9003, 2021."},{"key":"e_1_3_2_2_41_1","volume-title":"Illia Polosukhin. Attention Is All You Need. In Proceedings of the Advances in neural information processing systems (NeurIPS)","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan N. Gomez , Lukasz Kaiser , and Illia Polosukhin. Attention Is All You Need. In Proceedings of the Advances in neural information processing systems (NeurIPS) , volume 30 , 2017 . Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention Is All You Need. In Proceedings of the Advances in neural information processing systems (NeurIPS), volume 30, 2017."},{"key":"e_1_3_2_2_42_1","first-page":"4566","volume-title":"Devi Parikh. CIDEr: Consensus-based Image Description Evaluation. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Vedantam Ramakrishna","year":"2015","unstructured":"Ramakrishna Vedantam , C. Lawrence Zitnick , and Devi Parikh. CIDEr: Consensus-based Image Description Evaluation. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 4566 -- 4575 , 2015 . Ramakrishna Vedantam, C. Lawrence Zitnick, and Devi Parikh. CIDEr: Consensus-based Image Description Evaluation. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 4566--4575, 2015."},{"key":"e_1_3_2_2_43_1","first-page":"3156","volume-title":"Dumitru Erhan. Show and Tell: A Neural Image Caption Generator. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Vinyals Oriol","year":"2015","unstructured":"Oriol Vinyals , Alexander Toshev , Samy Bengio , and Dumitru Erhan. Show and Tell: A Neural Image Caption Generator. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 3156 -- 3164 , 2015 . Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and Tell: A Neural Image Caption Generator. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 3156--3164, 2015."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"e_1_3_2_2_45_1","first-page":"793","volume-title":"Chris Ding. Image Annotation Using Bi-Relational Graph of Images and Semantic Labels. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Wang Hua","year":"2011","unstructured":"Hua Wang , Heng Huang , and Chris Ding. Image Annotation Using Bi-Relational Graph of Images and Semantic Labels. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 793 -- 800 , 2011 . Hua Wang, Heng Huang, and Chris Ding. Image Annotation Using Bi-Relational Graph of Images and Semantic Labels. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 793--800, 2011."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3025661"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.542"},{"key":"e_1_3_2_2_48_1","first-page":"2048","volume-title":"Computer Science","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu , Jimmy Ba , Ryan Kiros , Kyunghyun Cho , Aaron Courville , Ruslan Salakhutdinov , Richard Zemel , and Y. Bengio . Show, attend and tell: Neural image caption generation with visual attention . Computer Science , pages 2048 -- 2057 , 2015 . Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhutdinov, Richard Zemel, and Y. Bengio. Show, attend and tell: Neural image caption generation with visual attention. Computer Science, pages 2048--2057, 2015."},{"key":"e_1_3_2_2_49_1","article-title":"Adversarially approximated autoencoder for image generation and manipulation","author":"Xu Wenju","year":"2019","unstructured":"Wenju Xu , Shawn Keshmiri , and Guanghui Wang . Adversarially approximated autoencoder for image generation and manipulation . IEEE Transactions on Multimedia (T-MM), pages 2387--2396 , 2019 . Wenju Xu, Shawn Keshmiri, and Guanghui Wang. Adversarially approximated autoencoder for image generation and manipulation. IEEE Transactions on Multimedia (T-MM), pages 2387--2396, 2019.","journal-title":"IEEE Transactions on Multimedia (T-MM), pages 2387--2396"},{"key":"e_1_3_2_2_50_1","first-page":"10677","volume-title":"Jianfei Cai. Auto-Encoding Scene Graphs for Image Captioning. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Yang Xu","year":"2019","unstructured":"Xu Yang , Kaihua Tang , Hanwang Zhang , and Jianfei Cai. Auto-Encoding Scene Graphs for Image Captioning. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 10677 -- 10686 , 2019 . Xu Yang, Kaihua Tang, Hanwang Zhang, and Jianfei Cai. Auto-Encoding Scene Graphs for Image Captioning. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 10677--10686, 2019."},{"key":"e_1_3_2_2_51_1","first-page":"9","volume-title":"Yao and Li Fei-Fei. Grouplet: A Structured Image Representation for Recognizing Human and Object Interactions. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Bangpeng","year":"2010","unstructured":"Bangpeng Yao and Li Fei-Fei. Grouplet: A Structured Image Representation for Recognizing Human and Object Interactions. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 9 -- 16 , 2010 . Bangpeng Yao and Li Fei-Fei. Grouplet: A Structured Image Representation for Recognizing Human and Object Interactions. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 9--16, 2010."},{"key":"e_1_3_2_2_52_1","first-page":"5263","volume-title":"Tao Mei. Incorporating Copying Mechanism in Image Captioning for Learning Novel Objects. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"Yao Ting","year":"2017","unstructured":"Ting Yao , Yingwei Pan , Yehao Li , and Tao Mei. Incorporating Copying Mechanism in Image Captioning for Learning Novel Objects. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 5263 -- 5271 , 2017 . Ting Yao, Yingwei Pan, Yehao Li, and Tao Mei. Incorporating Copying Mechanism in Image Captioning for Learning Novel Objects. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 5263--5271, 2017."},{"key":"e_1_3_2_2_53_1","first-page":"711","volume-title":"Tao Mei. Exploring Visual Relationship for Image Captioning. In Proceedings of the European Conference on Computer Vision (ECCV)","volume":"11218","author":"Yao Ting","year":"2018","unstructured":"Ting Yao , Yingwei Pan , Yehao Li , and Tao Mei. Exploring Visual Relationship for Image Captioning. In Proceedings of the European Conference on Computer Vision (ECCV) , volume 11218 , pages 711 -- 727 , 2018 . Ting Yao, Yingwei Pan, Yehao Li, and Tao Mei. Exploring Visual Relationship for Image Captioning. In Proceedings of the European Conference on Computer Vision (ECCV), volume 11218, pages 711--727, 2018."},{"key":"e_1_3_2_2_54_1","first-page":"2621","volume-title":"Tao Mei. Hierarchy Parsing for Image Captioning. In Proceedings of the IEEE International Conference on Computer Vision (ICCV)","author":"Yao Ting","year":"2019","unstructured":"Ting Yao , Yingwei Pan , Yehao Li , and Tao Mei. Hierarchy Parsing for Image Captioning. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , pages 2621 -- 2629 , 2019 . Ting Yao, Yingwei Pan, Yehao Li, and Tao Mei. Hierarchy Parsing for Image Captioning. In Proceedings of the IEEE International Conference on Computer Vision (ICCV), pages 2621--2629, 2019."},{"key":"e_1_3_2_2_55_1","first-page":"4904","volume-title":"Tao Mei. Boosting Image Captioning with Attributes. In Proceedings of the IEEE International Conference on Computer Vision (ICCV)","author":"Yao Ting","year":"2017","unstructured":"Ting Yao , Yingwei Pan , Yehao Li , Zhaofan Qiu , and Tao Mei. Boosting Image Captioning with Attributes. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , pages 4904 -- 4912 , 2017 . Ting Yao, Yingwei Pan, Yehao Li, Zhaofan Qiu, and Tao Mei. Boosting Image Captioning with Attributes. In Proceedings of the IEEE International Conference on Computer Vision (ICCV), pages 4904--4912, 2017."},{"key":"e_1_3_2_2_56_1","first-page":"4651","volume-title":"Jiebo Luo. Image Captioning with Semantic Attention. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR)","author":"You Quanzeng","year":"2016","unstructured":"Quanzeng You , Hailin Jin , Zhaowen Wang , Chen Fang , and Jiebo Luo. Image Captioning with Semantic Attention. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR) , pages 4651 -- 4659 , 2016 . Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image Captioning with Semantic Attention. In Proceedings of the IEEE International Computer Vision and Pattern Recognition (CVPR), pages 4651--4659, 2016."},{"key":"e_1_3_2_2_57_1","article-title":"A two-stage attentive network for single image super resolution","author":"Zhang Jiqing","year":"2021","unstructured":"Jiqing Zhang , Chengjiang Long , Yuxin Wang , Haiyin Piao , Haiyang Mei , Xin Yang , and Baocai Yin Yin . A two-stage attentive network for single image super resolution . IEEE Transactions on Circuits and Systems for Video Technology , 2021 . Jiqing Zhang, Chengjiang Long, Yuxin Wang, Haiyin Piao, Haiyang Mei, Xin Yang, and Baocai Yin Yin. A two-stage attentive network for single image super resolution. IEEE Transactions on Circuits and Systems for Video Technology, 2021.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"e_1_3_2_2_58_1","first-page":"1","volume-title":"Proceedings of the 2020 IEEE International Conference on Multimedia and Expo (ICME)","author":"Zhang Jiqing","year":"2020","unstructured":"Jiqing Zhang , Chengjiang Long , Yuxin Wang , Xin Yang , Haiyang Mei , and Baocai Yin . Multi-context and enhanced reconstruction network for single image super resolution . In Proceedings of the 2020 IEEE International Conference on Multimedia and Expo (ICME) , pages 1 -- 6 , 2020 . Jiqing Zhang, Chengjiang Long, Yuxin Wang, Xin Yang, Haiyang Mei, and Baocai Yin. Multi-context and enhanced reconstruction network for single image super resolution. In Proceedings of the 2020 IEEE International Conference on Multimedia and Expo (ICME), pages 1--6, 2020."},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2712283"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"crossref","DOI":"10.1515\/9783110524116","article-title":"Palette-based image recoloring using color decomposition optimization","author":"Zhang Qing","year":"2017","unstructured":"Qing Zhang , Chunxia Xiao , Hanqiu Sun , and Feng Tang . Palette-based image recoloring using color decomposition optimization . IEEE Transactions on Image Processing, PP(4):1--1 , 2017 . Qing Zhang, Chunxia Xiao, Hanqiu Sun, and Feng Tang. Palette-based image recoloring using color decomposition optimization. IEEE Transactions on Image Processing, PP(4):1--1, 2017.","journal-title":"IEEE Transactions on Image Processing, PP(4):1--1"},{"key":"e_1_3_2_2_61_1","first-page":"7623","volume-title":"Zhu and Shuqiang Jiang. Deep Structured Learning for Visual Relationship Detection. In Proceedings of the Association for the Advancement of Artificial Intelligence (AAAI)","author":"Yaohui","year":"2018","unstructured":"Yaohui Zhu and Shuqiang Jiang. Deep Structured Learning for Visual Relationship Detection. In Proceedings of the Association for the Advancement of Artificial Intelligence (AAAI) , pages 7623 -- 7630 , 2018 . Yaohui Zhu and Shuqiang Jiang. Deep Structured Learning for Visual Relationship Detection. In Proceedings of the Association for the Advancement of Artificial Intelligence (AAAI), pages 7623--7630, 2018."}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475439","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475439","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:33Z","timestamp":1750193313000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475439"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":61,"alternative-id":["10.1145\/3474085.3475439","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475439","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}