{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:21:56Z","timestamp":1750220516346,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":26,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,12,11]],"date-time":"2020-12-11T00:00:00Z","timestamp":1607644800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,12,11]]},"DOI":"10.1145\/3445815.3445845","type":"proceedings-article","created":{"date-parts":[[2021,3,17]],"date-time":"2021-03-17T17:05:28Z","timestamp":1616000728000},"page":"181-186","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-attention mechanism for Chinese description of videos"],"prefix":"10.1145","author":[{"given":"Hu","family":"Liu","sequence":"first","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junxiu","family":"Wu","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiabin","family":"Yuan","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,3,17]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6881"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/131"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2911996.2912049"},{"key":"e_1_3_2_1_5_1","volume-title":"The Fifth International Academic Conference for Graduates, NUAA","author":"Du X.","year":"2018","unstructured":"Du , X. , Yuan , J. , Dai , Y. : Chinese Description Generation of Open-domain Videos , The Fifth International Academic Conference for Graduates, NUAA , 2018 . Du, X., Yuan, J., Dai, Y.: Chinese Description Generation of Open-domain Videos, The Fifth International Academic Conference for Graduates, NUAA , 2018."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/2891460.2891535"},{"key":"e_1_3_2_1_8_1","volume-title":"Cipolla","author":"Patraucean V.","year":"2015","unstructured":"Patraucean , V. , Handa , A. , Cipolla , R. : Spatio-temporal video autoencoder with differentiable memory. CoRR , arXiv:1511.06309 ( 2015 ) Patraucean, V., Handa, A., Cipolla, R.: Spatio-temporal video autoencoder with differentiable memory. CoRR, arXiv:1511.06309 (2015)"},{"key":"e_1_3_2_1_9_1","first-page":"362","volume-title":"Wang","author":"Wu C.","year":"2018","unstructured":"Wu , C. , Wei , Y. , Chu , X. , Sun , W.C. , Su , F. , Wang , L.Q. : Hierarchical attention-based multimodal fusion for video captioning, In Neurocomputing , 2018 , pp. 362 - 370 . Wu, C., Wei, Y., Chu, X., Sun, W.C., Su, F., Wang, L.Q.: Hierarchical attention-based multimodal fusion for video captioning, In Neurocomputing, 2018, pp. 362-370."},{"key":"e_1_3_2_1_10_1","first-page":"2528","volume-title":"Conference on Computer Vision and Pattern Recognition Workshops, CVPR","author":"Hori C.","year":"2018","unstructured":"Hori , C. , Hori , T. , Wichern , G. , : Multimodal Attention for Fusion of Audio and Spatiotemporal Features for Video Description . In: Conference on Computer Vision and Pattern Recognition Workshops, CVPR , 2018 , pp. 2528 - 2531 . Hori, C., Hori, T., Wichern, G., : Multimodal Attention for Fusion of Audio and Spatiotemporal Features for Video Description. In: Conference on Computer Vision and Pattern Recognition Workshops, CVPR, 2018, pp. 2528-2531."},{"key":"e_1_3_2_1_11_1","first-page":"6598","volume-title":"Conference on Computer Vision and Pattern Recognition, CVPR","author":"Rohrbach M.","year":"2019","unstructured":"Rohrbach , M. , Darrell , T. , Rohrbach , A. : Adversarial Inference for Multi-Sentence Video Description , Conference on Computer Vision and Pattern Recognition, CVPR , 2019 , pp. 6598 - 6608 . Rohrbach, M., Darrell, T., Rohrbach, A.: Adversarial Inference for Multi-Sentence Video Description, Conference on Computer Vision and Pattern Recognition, CVPR , 2019, pp. 6598-6608."},{"key":"e_1_3_2_1_12_1","first-page":"2048","volume-title":"Attend and Tell: Neural Image Caption Generation with Visual Attention. In: International Conference on Machine Learning, ICML","author":"Xu K.","year":"2015","unstructured":"Xu , K. , Ba , J. , Kiros , R. , : Show , Attend and Tell: Neural Image Caption Generation with Visual Attention. In: International Conference on Machine Learning, ICML , 2015 , pp. 2048 - 2057 . Xu, K., Ba, J., Kiros, R., : Show, Attend and Tell: Neural Image Caption Generation with Visual Attention. In: International Conference on Machine Learning, ICML, 2015, pp. 2048-2057."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.512"},{"key":"e_1_3_2_1_14_1","first-page":"1097","volume-title":"International Conference on Neural Information Processing Systems, Curran Associates Inc","volume":"25","author":"Krizhevsky A.","year":"2012","unstructured":"Krizhevsky , A. , Sutskever , I. , Hinton , G.E. , ImageNet classification with deep convolutional neural networks . In: International Conference on Neural Information Processing Systems, Curran Associates Inc , 2012 , vol. 25 , pp. 1097 - 1105 . Krizhevsky, A., Sutskever, I., Hinton, G.E., ImageNet classification with deep convolutional neural networks. In: International Conference on Neural Information Processing Systems, Curran Associates Inc, 2012, vol. 25, pp. 1097-1105."},{"key":"e_1_3_2_1_15_1","first-page":"91","volume-title":"Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In: Annual Conference on Neural Information Processing Systems, NIPS","author":"Ren S.","year":"2015","unstructured":"Ren , S. , He , K. , Liu , B. , Girshick , R.B. , Sun , J. , Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In: Annual Conference on Neural Information Processing Systems, NIPS , 2015 , pp. 91 - 99 . Ren, S., He, K., Liu, B., Girshick, R.B., Sun, J., Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In: Annual Conference on Neural Information Processing Systems, NIPS, 2015, pp. 91-99."},{"key":"e_1_3_2_1_16_1","volume-title":"The kinetics human action video dataset","author":"Kay W.","year":"2017","unstructured":"Kay , W. , Carreira , J. , Simonyan , K. , : The kinetics human action video dataset , 2017 , arXiv preprint arXiv:1705.06950. Kay, W., Carreira, J., Simonyan, K., : The kinetics human action video dataset, 2017, arXiv preprint arXiv:1705.06950."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Giannakopoulos T.: Pyaudioanalysis: an open-source python library for audio signal analysis. PLoS ONE 10(12) e0144610 (2015)  Giannakopoulos T.: Pyaudioanalysis: an open-source python library for audio signal analysis. PLoS ONE 10(12) e0144610 (2015)","DOI":"10.1371\/journal.pone.0144610"},{"key":"e_1_3_2_1_19_1","first-page":"4902","volume-title":"D.: State-of-the-art Chinese Word Segmentation with Bi-LSTMs. In: Conference on Empirical Methods in Natural Language Processing, EMNLP","author":"Ma J.","year":"2018","unstructured":"Ma , J. , Ganchev , K. , Weiss , D.: State-of-the-art Chinese Word Segmentation with Bi-LSTMs. In: Conference on Empirical Methods in Natural Language Processing, EMNLP , 2018 , pp. 4902 - 4908 . Ma, J., Ganchev, K., Weiss, D.: State-of-the-art Chinese Word Segmentation with Bi-LSTMs. In: Conference on Empirical Methods in Natural Language Processing, EMNLP, 2018, pp. 4902-4908."},{"key":"e_1_3_2_1_20_1","volume-title":"Mian","author":"Aafaq X.","year":"2018","unstructured":"Aafaq , X. , Gilani , S.Z. , Liu , W. , Mian , A. : Video Description: A Survey of Methods, Datasets and Evaluation Metrics. CoRR , 2018 , arXiv:1806.00186 Aafaq, X., Gilani, S.Z., Liu, W., Mian, A.: Video Description: A Survey of Methods, Datasets and Evaluation Metrics. CoRR, 2018, arXiv:1806.00186"},{"volume-title":"ACM on Multimedia Conference. ACM","author":"Yi B.","key":"e_1_3_2_1_21_1","unstructured":"Yi B. , Yang Y. , Shen F. , : Bidirectional Long-Short Term Memory for Video Description , ACM on Multimedia Conference. ACM , 2016: 436-440. Yi B., Yang Y., Shen F., : Bidirectional Long-Short Term Memory for Video Description, ACM on Multimedia Conference. ACM, 2016:436-440."},{"key":"e_1_3_2_1_22_1","volume-title":"Behrad","author":"Nabati M.","year":"2020","unstructured":"Nabati , M. , Behrad , A. : Video captioning using boosted and parallel Long Short-Term Memory networks, In : Comput. Vis. Image Underst , 2020 , v. 190. Nabati, M., Behrad, A.: Video captioning using boosted and parallel Long Short-Term Memory networks, In: Comput. Vis. Image Underst, 2020, v. 190."},{"key":"e_1_3_2_1_23_1","first-page":"24","volume-title":"Mao","author":"Xu Y.","year":"2019","unstructured":"Xu , Y. , Yang , J. , Mao , K. : Semantic-filtered Soft-Split-Aware video captioning with audio-augmented featur, In : Neurocomputing , 2019 , v. 357, pp. 24 - 35 Xu, Y., Yang, J., Mao, K.: Semantic-filtered Soft-Split-Aware video captioning with audio-augmented featur, In: Neurocomputing, 2019, v. 357, pp.24-35"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1117"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Wang X. Wu J. Chen J. Li L. Wang Y.F.: Wang W.Y : VaTeX: A Large-Scale High-Quality Multilingual Dataset for Video-and-Language Research. ICCV\u20022019:  4580-4590  Wang X. Wu J. Chen J. Li L. Wang Y.F.: Wang W.Y : VaTeX: A Large-Scale High-Quality Multilingual Dataset for Video-and-Language Research. ICCV\u20022019: 4580-4590","DOI":"10.1109\/ICCV.2019.00468"},{"key":"e_1_3_2_1_26_1","volume-title":"Multi-modal Feature Fusion with Feature Attention for VATEX Captioning Challenge","author":"Lin K.","year":"2020","unstructured":"Lin K. , Gan Z. , Wang L. : Multi-modal Feature Fusion with Feature Attention for VATEX Captioning Challenge 2020 , 2020, CoRR\u2002abs\/ 2006.03315 Lin K., Gan Z., Wang L.: Multi-modal Feature Fusion with Feature Attention for VATEX Captioning Challenge 2020, 2020, CoRR\u2002abs\/2006.03315"}],"event":{"name":"CSAI 2020: 2020 4th International Conference on Computer Science and Artificial Intelligence","acronym":"CSAI 2020","location":"Zhuhai China"},"container-title":["2020 4th International Conference on Computer Science and Artificial Intelligence"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3445815.3445845","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3445815.3445845","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T21:24:33Z","timestamp":1750195473000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3445815.3445845"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,12,11]]},"references-count":26,"alternative-id":["10.1145\/3445815.3445845","10.1145\/3445815"],"URL":"https:\/\/doi.org\/10.1145\/3445815.3445845","relation":{},"subject":[],"published":{"date-parts":[[2020,12,11]]},"assertion":[{"value":"2021-03-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}