{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:23:42Z","timestamp":1750220622335,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":18,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,10,12]],"date-time":"2020-10-12T00:00:00Z","timestamp":1602460800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100012659","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61525102, 61831005"],"award-info":[{"award-number":["61525102, 61831005"]}],"id":[{"id":"10.13039\/501100012659","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,10,12]]},"DOI":"10.1145\/3394171.3416288","type":"proceedings-article","created":{"date-parts":[[2020,10,12]],"date-time":"2020-10-12T12:26:25Z","timestamp":1602505585000},"page":"4610-4614","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Multi-stage Tag Guidance Network in Video Caption"],"prefix":"10.1145","author":[{"given":"Lanxiao","family":"Wang","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Chao","family":"Shang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Heqian","family":"Qiu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Taijin","family":"Zhao","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Benliu","family":"Qiu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Hongliang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]}],"member":"320","published-online":{"date-parts":[[2020,10,12]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"Joao Carreira and Andrew Zisserman. 2017. Quo Vadis Action Recognition? A New Model and the Kinetics Dataset. (2017).  Joao Carreira and Andrew Zisserman. 2017. Quo Vadis Action Recognition? A New Model and the Kinetics Dataset. (2017).","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_2_1","unstructured":"Haoran Chen Jianmin Li and Xiaolin Hu. 2020. Delving Deeper into the Decoder for Video Captioning. (2020).  Haoran Chen Jianmin Li and Xiaolin Hu. 2020. Delving Deeper into the Decoder for Video Captioning. (2020)."},{"key":"e_1_3_2_2_3_1","unstructured":"Kyunghyun Cho Bart Van Merrienboer Caglar Gulcehre Dzmitry Bahdanau Fethi Bougares Holger Schwenk and Yoshua Bengio. 2014. Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation. Computer Science (2014).  Kyunghyun Cho Bart Van Merrienboer Caglar Gulcehre Dzmitry Bahdanau Fethi Bougares Holger Schwenk and Yoshua Bengio. 2014. Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation. Computer Science (2014)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"crossref","unstructured":"Ji Feng. 2017. Deep MIML Network. In AAAI-17.  Ji Feng. 2017. Deep MIML Network. In AAAI-17.","DOI":"10.1609\/aaai.v31i1.10890"},{"key":"e_1_3_2_2_5_1","unstructured":"Sergio Guadarrama Niveda Krishnamoorthy Girish Malkarnenkar Subhashini Venugopalan Raymond Mooney Trevor Darrell and Kate Saenko. 2015. YouTube2Text: Recognizing and Describing Arbitrary Activities Using Semantic Hierarchies and Zero-shot Recognition. (2015).  Sergio Guadarrama Niveda Krishnamoorthy Girish Malkarnenkar Subhashini Venugopalan Raymond Mooney Trevor Darrell and Kate Saenko. 2015. YouTube2Text: Recognizing and Describing Arbitrary Activities Using Semantic Hierarchies and Zero-shot Recognition. (2015)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_2_7_1","unstructured":"Sergey Ioffe and Christian Szegedy. 2015. Batch normalization: Accelerating deep network training by reducing internal covariate shift. arXiv preprint arXiv:1502.03167 (2015).  Sergey Ioffe and Christian Szegedy. 2015. Batch normalization: Accelerating deep network training by reducing internal covariate shift. arXiv preprint arXiv:1502.03167 (2015)."},{"volume-title":"Jointly Localizing and Describing Events for Dense Video Captioning. In 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","year":"2018","author":"Li Yehao","key":"e_1_3_2_2_8_1"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"Taesup Moon Heeyoul Choi Hoshik Lee and Inchul Song. 2015. [IEEE 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU) - Scottsdale AZ USA (2015.12.13--2015.12.17)] 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU) - RNNDROP: A novel dropout for RNNS in ASR. (2015) 65--70.  Taesup Moon Heeyoul Choi Hoshik Lee and Inchul Song. 2015. [IEEE 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU) - Scottsdale AZ USA (2015.12.13--2015.12.17)] 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU) - RNNDROP: A novel dropout for RNNS in ASR. (2015) 65--70.","DOI":"10.1109\/ASRU.2015.7404775"},{"key":"e_1_3_2_2_10_1","unstructured":"Yingwei Pan Yehao Li Jianjie Luo Jun Xu Ting Yao and Tao Mei. 2020. Auto-captions on GIF: A Large-scale Video-sentence Dataset for Vision-language Pre-training. arXiv preprint arXiv:2007.02375 (2020).  Yingwei Pan Yehao Li Jianjie Luo Jun Xu Ting Yao and Tao Mei. 2020. Auto-captions on GIF: A Large-scale Video-sentence Dataset for Vision-language Pre-training. arXiv preprint arXiv:2007.02375 (2020)."},{"key":"e_1_3_2_2_11_1","unstructured":"Yingwei Pan Tao Mei Ting Yao Houqiang Li and Yong Rui. 2016. Jointly Modeling Embedding and Translation to Bridge Video and Language. (2016).  Yingwei Pan Tao Mei Ting Yao Houqiang Li and Yong Rui. 2016. Jointly Modeling Embedding and Translation to Bridge Video and Language. (2016)."},{"volume-title":"Video Captioning with Transferred Semantic Attributes. In 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","year":"2017","author":"Pan Yingwei","key":"e_1_3_2_2_12_1"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2984062"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Subhashini Venugopalan Marcus Rohrbach Jeff Donahue Raymond Mooney Trevor Darrell and Kate Saenko. 2015. Sequence to Sequence -- Video to Text.  Subhashini Venugopalan Marcus Rohrbach Jeff Donahue Raymond Mooney Trevor Darrell and Kate Saenko. 2015. Sequence to Sequence -- Video to Text.","DOI":"10.1109\/ICCV.2015.515"},{"volume-title":"Show and Tell: A Neural Image Caption Generator. In 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) .","year":"2015","author":"Vinyals Oriol","key":"e_1_3_2_2_15_1"},{"key":"e_1_3_2_2_16_1","unstructured":"Saining Xie Ross Girshick Piotr Doll\u00e1r Zhuowen Tu and Kaiming He. 2016. Aggregated Residual Transformations for Deep Neural Networks. (2016).  Saining Xie Ross Girshick Piotr Doll\u00e1r Zhuowen Tu and Kaiming He. 2016. Aggregated Residual Transformations for Deep Neural Networks. (2016)."},{"volume-title":"MSR-VTT: A Large Video Description Dataset for Bridging Video and Language. In Conference on Computer Vision and Pattern Recognition (CVPR) .","year":"2016","author":"Xu Jun","key":"e_1_3_2_2_17_1"},{"volume-title":"ECO: Efficient Convolutional Network for Online Video Understanding.","year":"2018","author":"Zolfaghari Mohammadreza","key":"e_1_3_2_2_18_1"}],"event":{"name":"MM '20: The 28th ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Seattle WA USA","acronym":"MM '20"},"container-title":["Proceedings of the 28th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3394171.3416288","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3394171.3416288","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T22:01:24Z","timestamp":1750197684000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3394171.3416288"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,10,12]]},"references-count":18,"alternative-id":["10.1145\/3394171.3416288","10.1145\/3394171"],"URL":"https:\/\/doi.org\/10.1145\/3394171.3416288","relation":{},"subject":[],"published":{"date-parts":[[2020,10,12]]},"assertion":[{"value":"2020-10-12","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}