{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T02:32:48Z","timestamp":1775010768874,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":22,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,12,22]],"date-time":"2021-12-22T00:00:00Z","timestamp":1640131200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["2019TQ0286,2020M682349"],"award-info":[{"award-number":["2019TQ0286,2020M682349"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62006211"],"award-info":[{"award-number":["62006211"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,12,22]]},"DOI":"10.1145\/3512576.3512606","type":"proceedings-article","created":{"date-parts":[[2022,4,11]],"date-time":"2022-04-11T16:22:04Z","timestamp":1649694124000},"page":"168-173","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Human Action Captioning based on a GRU+LSTM+Attention Model"],"prefix":"10.1145","author":[{"given":"Lijuan","family":"Zhou","sequence":"first","affiliation":[{"name":"School of Computer and Artificial Intelligence, Zhengzhou University, China and Zhengzhou Zoneyet Technology Co., Ltd., China"}]},{"given":"Weicong","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer and Artificial Intelligence, Zhengzhou University, China and Zhengzhou Zoneyet Technology Co., Ltd., China"}]},{"given":"Xiaojie","family":"Qian","sequence":"additional","affiliation":[{"name":"School of Computer and Artificial Intelligence, Zhengzhou University, China"}]}],"member":"320","published-online":{"date-parts":[[2022,4,11]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"5998","article-title":"Attention is all you need","author":"Shazeer Noam","year":"2017","unstructured":"Vaswani, Ashish, Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan N. Gomez , Lukasz Kaiser , and Illia Polosukhin . \" Attention is all you need .\" In Proceedings of Advances in Neural Information Processing Systems , pp. 5998 - 6008 . 2017 . Vaswani, Ashish, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. \"Attention is all you need.\" In Proceedings of Advances in Neural Information Processing Systems, pp. 5998-6008. 2017.","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1020346032608"},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of 2013 IEEE International Conference on Computer Vision","author":"Qiu Wei","year":"2013","unstructured":"Rohrbach, Marcus, Wei Qiu , Ivan Titov , Stefan Thater , Manfred Pinkal , and Bernt Schiele . \u201c Translating Video Content to Natural Language Descriptions .\u201d In Proceedings of 2013 IEEE International Conference on Computer Vision ( 2013 ). doi:10.1109\/iccv.2013.61. 10.1109\/iccv.2013.61 Rohrbach, Marcus, Wei Qiu, Ivan Titov, Stefan Thater, Manfred Pinkal, and Bernt Schiele. \u201cTranslating Video Content to Natural Language Descriptions.\u201d In Proceedings of 2013 IEEE International Conference on Computer Vision (2013). doi:10.1109\/iccv.2013.61."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of 2013 IEEE International Conference on Computer Vision","author":"Krishnamoorthy Niveda","year":"2013","unstructured":"Guadarrama, Sergio, Niveda Krishnamoorthy , Girish Malkarnenkar , Subhashini Venugopalan , Raymond Mooney , Trevor Darrell , and Kate Saenko . \u201c YouTube2Text: Recognizing and Describing Arbitrary Activities Using Semantic Hierarchies and Zero-Shot Recognition .\u201d In Proceedings of 2013 IEEE International Conference on Computer Vision ( 2013 ). doi:10.1109\/iccv.2013.337. 10.1109\/iccv.2013.337 Guadarrama, Sergio, Niveda Krishnamoorthy, Girish Malkarnenkar, Subhashini Venugopalan, Raymond Mooney, Trevor Darrell, and Kate Saenko. \u201cYouTube2Text: Recognizing and Describing Arbitrary Activities Using Semantic Hierarchies and Zero-Shot Recognition.\u201d In Proceedings of 2013 IEEE International Conference on Computer Vision (2013). doi:10.1109\/iccv.2013.337."},{"key":"e_1_3_2_1_5_1","first-page":"15","volume-title":"Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"Xu Huijuan","year":"2015","unstructured":"Venugopalan, Subhashini, Huijuan Xu , Jeff Donahue , Marcus Rohrbach , Raymond Mooney , and Kate Saenko . \u201c Translating Videos to Natural Language Using Deep Recurrent Neural Networks .\u201d In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies ( 2015 ). doi:10.3115\/v1\/n 15 - 1173 . 10.3115\/v1 Venugopalan, Subhashini, Huijuan Xu, Jeff Donahue, Marcus Rohrbach, Raymond Mooney, and Kate Saenko. \u201cTranslating Videos to Natural Language Using Deep Recurrent Neural Networks.\u201d In Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (2015). doi:10.3115\/v1\/n15-1173."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of 2015 IEEE International Conference on Computer Vision","author":"Torabi Atousa","year":"2015","unstructured":"Yao, Li, Atousa Torabi , Kyunghyun Cho , Nicolas Ballas , Christopher Pal , Hugo Larochelle , and Aaron Courville . \u201c Describing Videos by Exploiting Temporal Structure .\u201d In Proceedings of 2015 IEEE International Conference on Computer Vision ( 2015 ). doi:10.1109\/iccv.2015.512. 10.1109\/iccv.2015.512 Yao, Li, Atousa Torabi, Kyunghyun Cho, Nicolas Ballas, Christopher Pal, Hugo Larochelle, and Aaron Courville. \u201cDescribing Videos by Exploiting Temporal Structure.\u201d In Proceedings of 2015 IEEE International Conference on Computer Vision (2015). doi:10.1109\/iccv.2015.512."},{"key":"e_1_3_2_1_7_1","volume-title":"PMLR","author":"Li Yingming","year":"2018","unstructured":"Chen, Ming, Yingming Li , Zhongfei Zhang , and Siyu Huang . \"Tvt : Two-view transformer network for video captioning.\" In Proceedings of Asian Conference on Machine Learning, pp. 847-862 . PMLR , 2018 . Chen, Ming, Yingming Li, Zhongfei Zhang, and Siyu Huang. \"Tvt: Two-view transformer network for video captioning.\" In Proceedings of Asian Conference on Machine Learning, pp. 847-862. PMLR, 2018."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00911"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the 27th ACM International Conference on Multimedia","author":"Chen Zhenzhong","year":"2019","unstructured":"Hu, Yaosi, Zhenzhong Chen , Zheng-Jun Zha , and Feng Wu . \u201c Hierarchical Global-Local Temporal Modeling for Video Captioning .\u201d In Proceedings of the 27th ACM International Conference on Multimedia ( 2019 ). doi:10.1145\/3343031.3351072. 10.1145\/3343031.3351072 Hu, Yaosi, Zhenzhong Chen, Zheng-Jun Zha, and Feng Wu. \u201cHierarchical Global-Local Temporal Modeling for Video Captioning.\u201d In Proceedings of the 27th ACM International Conference on Multimedia (2019). doi:10.1145\/3343031.3351072."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Shi Yaya","year":"2020","unstructured":"Zhang, Ziqi, Yaya Shi , Chunfeng Yuan , Bing Li , Peijin Wang , Weiming Hu , and Zheng-Jun Zha . \u201c Object Relational Graph With Teacher-Recommended Learning for Video Captioning .\u201d In Proceedings of 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition ( 2020 ). doi:10.1109\/cvpr42600.2020.01329. 10.1109\/cvpr42600.2020.01329 Zhang, Ziqi, Yaya Shi, Chunfeng Yuan, Bing Li, Peijin Wang, Weiming Hu, and Zheng-Jun Zha. \u201cObject Relational Graph With Teacher-Recommended Learning for Video Captioning.\u201d In Proceedings of 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2020). doi:10.1109\/cvpr42600.2020.01329."},{"key":"e_1_3_2_1_11_1","volume-title":"A simple and performant baseline for vision and language.\" arXiv preprint arXiv:1908.03557","author":"Harold Liunian","year":"2019","unstructured":"Li, Liunian Harold , Mark Yatskar , Da Yin , Cho-Jui Hsieh , and Kai-Wei Chang . \"Visualbert : A simple and performant baseline for vision and language.\" arXiv preprint arXiv:1908.03557 ( 2019 ). Li, Liunian Harold, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. \"Visualbert: A simple and performant baseline for vision and language.\" arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Yang Yi","year":"2020","unstructured":"Zhu, Linchao, and Yi Yang . \u201c ActBERT: Learning Global-Local Video-Text Representations .\u201d In Proceedings of 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition ( 2020 ). doi:10.1109\/cvpr42600.2020.00877. 10.1109\/cvpr42600.2020.00877 Zhu, Linchao, and Yi Yang. \u201cActBERT: Learning Global-Local Video-Text Representations.\u201d In Proceedings of 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2020). doi:10.1109\/cvpr42600.2020.00877."},{"key":"e_1_3_2_1_13_1","first-page":"7005","volume-title":"End-to-End Learning of Video-Based Text Generation From Multimodal Inputs.\" In Proceedings of 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Bertasius Gedas","year":"2021","unstructured":"Lin, Xudong, Gedas Bertasius , Jue Wang , Shih-Fu Chang , Devi Parikh , and Lorenzo Torresani . \"VX2TEXT : End-to-End Learning of Video-Based Text Generation From Multimodal Inputs.\" In Proceedings of 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition , pp. 7005 - 7015 . 2021 . Lin, Xudong, Gedas Bertasius, Jue Wang, Shih-Fu Chang, Devi Parikh, and Lorenzo Torresani. \"VX2TEXT: End-to-End Learning of Video-Based Text Generation From Multimodal Inputs.\" In Proceedings of 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7005-7015. 2021."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Qi Zhongang","year":"2021","unstructured":"Zhang, Ziqi, Zhongang Qi , Chunfeng Yuan , Ying Shan , Bing Li , Ying Deng , and Weiming Hu . \u201c Open-Book Video Captioning with Retrieve-Copy-Generate Network .\u201d In Proceedings of 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition ( 2021 ). doi:10.1109\/cvpr46437.2021.00971. 10.1109\/cvpr46437.2021.00971 Zhang, Ziqi, Zhongang Qi, Chunfeng Yuan, Ying Shan, Bing Li, Ying Deng, and Weiming Hu. \u201cOpen-Book Video Captioning with Retrieve-Copy-Generate Network.\u201d In Proceedings of 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2021). doi:10.1109\/cvpr46437.2021.00971."},{"key":"e_1_3_2_1_15_1","first-page":"234","article-title":"Sketch, Ground, and Refine: Top-Down Dense Video Captioning","author":"Chen Shizhe","year":"2021","unstructured":"Deng, Chaorui, Shizhe Chen , Da Chen , Yuan He , and Qi Wu . \" Sketch, Ground, and Refine: Top-Down Dense Video Captioning .\" In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition , pp. 234 - 243 . 2021 . Deng, Chaorui, Shizhe Chen, Da Chen, Yuan He, and Qi Wu. \"Sketch, Ground, and Refine: Top-Down Dense Video Captioning.\" In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 234-243. 2021.","journal-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition"},{"key":"e_1_3_2_1_16_1","unstructured":"Bahdanau Dzmitry Kyunghyun Cho and Yoshua Bengio. \"Neural machine translation by jointly learning to align and translate.\" arXiv preprint arXiv:1409.0473 (2014).  Bahdanau Dzmitry Kyunghyun Cho and Yoshua Bengio. \"Neural machine translation by jointly learning to align and translate.\" arXiv preprint arXiv:1409.0473 (2014)."},{"key":"e_1_3_2_1_17_1","first-page":"15","volume-title":"Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing","author":"Pham Hieu","year":"2015","unstructured":"Luong, Thang, Hieu Pham , and Christopher D. Manning . \u201cEffective Approaches to Attention-Based Neural Machine Translation .\u201d In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing ( 2015 ). doi:10.18653\/v1\/d 15 - 1166 . 10.18653\/v1 Luong, Thang, Hieu Pham, and Christopher D. Manning. \u201cEffective Approaches to Attention-Based Neural Machine Translation.\u201d In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing (2015). doi:10.18653\/v1\/d15-1166."},{"key":"e_1_3_2_1_18_1","unstructured":"Mikolov Tomas Kai Chen Greg Corrado and Jeffrey Dean. \"Efficient estimation of word representations in vector space.\" arXiv preprint arXiv:1301.3781 (2013).  Mikolov Tomas Kai Chen Greg Corrado and Jeffrey Dean. \"Efficient estimation of word representations in vector space.\" arXiv preprint arXiv:1301.3781 (2013)."},{"key":"e_1_3_2_1_19_1","first-page":"14","volume-title":"Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing","author":"Socher Richard","year":"2014","unstructured":"Pennington, Jeffrey, Richard Socher , and Christopher Manning . \u201c Glove: Global Vectors for Word Representation .\u201d Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ( 2014 ). doi:10.3115\/v1\/d 14 - 1162 . 10.3115\/v1 Pennington, Jeffrey, Richard Socher, and Christopher Manning. \u201cGlove: Global Vectors for Word Representation.\u201d Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (2014). doi:10.3115\/v1\/d14-1162."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.06.035"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 40th Annual Meeting on Association for Computational Linguistics","author":"Roukos Salim","year":"2001","unstructured":"Papineni, Kishore, Salim Roukos , Todd Ward , and Wei-Jing Zhu . \u201c BLEU .\u201d In Proceedings of the 40th Annual Meeting on Association for Computational Linguistics ( 2001 ). doi:10.3115\/1073083.1073135. 10.3115\/1073083.1073135 Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. \u201cBLEU.\u201d In Proceedings of the 40th Annual Meeting on Association for Computational Linguistics (2001). doi:10.3115\/1073083.1073135."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the Second Workshop on Statistical Machine Translation","author":"Agarwal Abhaya","year":"2007","unstructured":"Lavie, Alon, and Abhaya Agarwal . \u201c Meteor .\u201d Proceedings of the Second Workshop on Statistical Machine Translation ( 2007 ). doi:10.3115\/1626355.1626389. 10.3115\/1626355.1626389 Lavie, Alon, and Abhaya Agarwal. \u201cMeteor.\u201d Proceedings of the Second Workshop on Statistical Machine Translation (2007). doi:10.3115\/1626355.1626389."}],"event":{"name":"ICIT 2021: IoT and Smart City","location":"Guangzhou China","acronym":"ICIT 2021"},"container-title":["2021 The 9th International Conference on Information Technology: IoT and Smart City"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3512576.3512606","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3512576.3512606","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:11:51Z","timestamp":1750191111000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3512576.3512606"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,22]]},"references-count":22,"alternative-id":["10.1145\/3512576.3512606","10.1145\/3512576"],"URL":"https:\/\/doi.org\/10.1145\/3512576.3512606","relation":{},"subject":[],"published":{"date-parts":[[2021,12,22]]},"assertion":[{"value":"2022-04-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}