{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,8]],"date-time":"2026-02-08T04:49:28Z","timestamp":1770526168851,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,10,12]],"date-time":"2020-10-12T00:00:00Z","timestamp":1602460800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of China","award":["61772535"],"award-info":[{"award-number":["61772535"]}]},{"name":"Beijing Natural Science Foundation","award":["4192028"],"award-info":[{"award-number":["4192028"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,10,12]]},"DOI":"10.1145\/3394171.3413890","type":"proceedings-article","created":{"date-parts":[[2020,10,12]],"date-time":"2020-10-12T12:27:38Z","timestamp":1602505658000},"page":"2599-2607","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":22,"title":["VideoIC: A Video Interactive Comments Dataset and Multimodal Multitask Learning for Comments Generation"],"prefix":"10.1145","author":[{"given":"Weiying","family":"Wang","sequence":"first","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"given":"Jieting","family":"Chen","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2020,10,12]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2909054"},{"key":"e_1_3_2_2_2_1","volume-title":"International Conference on Learning Representations.","author":"Caiming Xiong Victor Zhong","year":"2017"},{"key":"e_1_3_2_2_3_1","unstructured":"Shizhe Chen Yuqing Song Yida Zhao Qin Jin Zhaoyang Zeng Bei Liu Jianlong Fu and Alexander Hauptmann. 2019. Activitynet 2019 Task 3: Exploring Contexts for Dense Captioning Events in Videos. arXiv preprint arXiv:1907.05092 (2019).  Shizhe Chen Yuqing Song Yida Zhao Qin Jin Zhaoyang Zeng Bei Liu Jianlong Fu and Alexander Hauptmann. 2019. Activitynet 2019 Task 3: Exploring Contexts for Dense Captioning Events in Videos. arXiv preprint arXiv:1907.05092 (2019)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077136.3080776"},{"key":"e_1_3_2_2_5_1","unstructured":"Kyunghyun Cho Bart Van Merri\u00ebnboer Caglar Gulcehre Dzmitry Bahdanau Fethi Bougares Holger Schwenk and Yoshua Bengio. 2014. Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014).  Kyunghyun Cho Bart Van Merri\u00ebnboer Caglar Gulcehre Dzmitry Bahdanau Fethi Bougares Holger Schwenk and Yoshua Bengio. 2014. Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)."},{"key":"e_1_3_2_2_6_1","volume-title":"Visual Dialog. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Das Abhishek","year":"2017"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.340"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-3348"},{"key":"e_1_3_2_2_9_1","volume-title":"Workshop on Text Summarization Branches Out.","author":"Flick Carlos","year":"2004"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00680"},{"key":"e_1_3_2_2_11_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2015. Deep Residual Learning for Image Recognition. arXiv preprint arXiv:1512.03385 (2015).  Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2015. Deep Residual Learning for Image Recognition. arXiv preprint arXiv:1512.03385 (2015)."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation Vol. 9 8 (1997) 1735--1780.  Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation Vol. 9 8 (1997) 1735--1780.","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_2_13_1","unstructured":"Jin-Hwa Kim Jaehyun Jun and Byoung-Tak Zhang. 2018. Bilinear Attention Networks. In Advances in Neural Information Processing Systems 31. 1571--1581.  Jin-Hwa Kim Jaehyun Jun and Byoung-Tak Zhang. 2018. Bilinear Attention Networks. In Advances in Neural Information Processing Systems 31. 1571--1581."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_2_15_1","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 1097--1105.  Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 1097--1105."},{"key":"e_1_3_2_2_16_1","volume-title":"TVQA: Localized, Compositional Video Question Answering. In EMNLP.","author":"Lei Jie","year":"2018"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"crossref","unstructured":"Jie Lei Licheng Yu Tamara L Berg and Mohit Bansal. 2019. TVQA+: Spatio-Temporal Grounding for Video Question Answering. arXiv preprint arXiv:1904.11574 (2019).  Jie Lei Licheng Yu Tamara L Berg and Mohit Bansal. 2019. TVQA+: Spatio-Temporal Grounding for Video Question Answering. arXiv preprint arXiv:1904.11574 (2019).","DOI":"10.18653\/v1\/2020.acl-main.730"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2946477"},{"key":"e_1_3_2_2_19_1","unstructured":"Jiasen Lu Jianwei Yang Dhruv Batra and Devi Parikh. 2016. Hierarchical question-image co-attention for visual question answering. In Advances in neural information processing systems. 289--297.  Jiasen Lu Jianwei Yang Dhruv Batra and Devi Parikh. 2016. Hierarchical question-image co-attention for visual question answering. In Advances in neural information processing systems. 289--297."},{"key":"e_1_3_2_2_20_1","unstructured":"Guangyi Lv Kun Zhang Le Wu Enhong Chen Tong Xu Qi Liu and Weidong He. 2019. Understanding the Users and Videos by Mining a Novel Danmu Dataset. IEEE Transactions on Big Data (2019).  Guangyi Lv Kun Zhang Le Wu Enhong Chen Tong Xu Qi Liu and Weidong He. 2019. Understanding the Users and Videos by Mining a Novel Danmu Dataset. IEEE Transactions on Big Data (2019)."},{"key":"e_1_3_2_2_21_1","volume-title":"LiveBot: Generating Live Video Comments Based on Visual and Textual Contexts. In AAAI","author":"Ma Shuming","year":"2019"},{"key":"e_1_3_2_2_22_1","unstructured":"Brian McFee Matt McVicar Stefan Balke Carl Thom\u00e9 Vincent Lostanlen Colin Raffel Dana Lee Oriol Nieto Eric Battenberg Dan Ellis Ryuichi Yamamoto Josh Moore WZY Rachel Bittner Keunwoo Choi Pius Friesch Fabian-Robert St\u00f6ter Matt Vollrath Siddhartha Kumar nehz Simon Waloschek Seth Rimvydas Naktinis Douglas Repetto Curtis \"Fjord\" Hawthorne CJ Carr Jo\u00e3o Felipe Santos JackieWu Erik and Adrian Holovaty. 2018. librosa\/librosa: 0.6.2. https:\/\/doi.org\/10.5281\/zenodo.1342708  Brian McFee Matt McVicar Stefan Balke Carl Thom\u00e9 Vincent Lostanlen Colin Raffel Dana Lee Oriol Nieto Eric Battenberg Dan Ellis Ryuichi Yamamoto Josh Moore WZY Rachel Bittner Keunwoo Choi Pius Friesch Fabian-Robert St\u00f6ter Matt Vollrath Siddhartha Kumar nehz Simon Waloschek Seth Rimvydas Naktinis Douglas Repetto Curtis \"Fjord\" Hawthorne CJ Carr Jo\u00e3o Felipe Santos JackieWu Erik and Adrian Holovaty. 2018. librosa\/librosa: 0.6.2. https:\/\/doi.org\/10.5281\/zenodo.1342708"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00675"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00637"},{"key":"e_1_3_2_2_25_1","volume-title":"Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (ACL).","author":"Papineni Kishore","year":"2002"},{"key":"e_1_3_2_2_26_1","unstructured":"I Sutskever O Vinyals and QV Le. 2014. Sequence to sequence learning with neural networks. Advances in NIPS (2014).  I Sutskever O Vinyals and QV Le. 2014. Sequence to sequence learning with neural networks. Advances in NIPS (2014)."},{"key":"e_1_3_2_2_27_1","volume-title":"COIN: A Large-scale Dataset for Comprehensive Instructional Video Analysis. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Tang Yansong","year":"2019"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Ramakrishna Vedantam C. Lawrence Zitnick and Devi Parikh. 2015. CIDEr: Consensus-based Image Description Evaluation. In CVPR.  Ramakrishna Vedantam C. Lawrence Zitnick and Devi Parikh. 2015. CIDEr: Consensus-based Image Description Evaluation. In CVPR.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1517"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2877694"},{"key":"e_1_3_2_2_32_1","volume-title":"Herding Effect Based Attention for Personalized Time-Sync Video Recommendation. In 2019 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 454--459","author":"Yang Wenmian","year":"2019"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"crossref","unstructured":"Wenmain Yang Kun Wang Na Ruan Wenyuan Gao Weijia Jia Wei Zhao Nan Liu and Yunyong Zhang. 2019 b. Time-Sync Video Tag Extraction Using Semantic Association Graph. ACM Transactions on Knowledge Discovery from Data (TKDD) Vol. 13 4 (2019) 1--24.  Wenmain Yang Kun Wang Na Ruan Wenyuan Gao Weijia Jia Wei Zhao Nan Liu and Yunyong Zhang. 2019 b. Time-Sync Video Tag Extraction Using Semantic Association Graph. ACM Transactions on Knowledge Discovery from Data (TKDD) Vol. 13 4 (2019) 1--24.","DOI":"10.1145\/3332932"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.202"},{"key":"e_1_3_2_2_36_1","volume-title":"Towards Automatic Learning of Procedures From Web Instructional Videos. In AAAI Conference on Artificial Intelligence. 7590--7598","author":"Zhou Luowei","year":"2018"}],"event":{"name":"MM '20: The 28th ACM International Conference on Multimedia","location":"Seattle WA USA","acronym":"MM '20","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 28th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3394171.3413890","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3394171.3413890","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T21:32:06Z","timestamp":1750195926000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3394171.3413890"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,10,12]]},"references-count":36,"alternative-id":["10.1145\/3394171.3413890","10.1145\/3394171"],"URL":"https:\/\/doi.org\/10.1145\/3394171.3413890","relation":{},"subject":[],"published":{"date-parts":[[2020,10,12]]},"assertion":[{"value":"2020-10-12","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}