{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T17:26:23Z","timestamp":1769966783221,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475321","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T05:04:15Z","timestamp":1634533455000},"page":"1756-1765","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":25,"title":["Multimodal Video Summarization via Time-Aware Transformers"],"prefix":"10.1145","author":[{"given":"Xindi","family":"Shang","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"given":"Zehuan","family":"Yuan","sequence":"additional","affiliation":[{"name":"Bytedance.Inc, Beijing, China"}]},{"given":"Anran","family":"Wang","sequence":"additional","affiliation":[{"name":"Bytedance.Inc, Singapore, Singapore"}]},{"given":"Changhu","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance.Inc, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3355390"},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72","author":"Banerjee Satanjeev","year":"2005"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/2505515.2505652"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2014.2384912"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475173"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1438"},{"key":"e_1_3_2_1_7_1","volume-title":"Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509","author":"Child Rewon","year":"2019"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268961"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Devlin Jacob","year":"2019"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2013.2267205"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10584-0_33"},{"key":"e_1_3_2_1_12_1","volume-title":"AAAI Conference on Artificial Intelligence.","author":"Hao Wangli","year":"2018"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K19-1039"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.450"},{"key":"e_1_3_2_1_17_1","volume-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift. arXiv preprint arXiv:1502.03167","author":"Ioffe Sergey","year":"2015"},{"key":"e_1_3_2_1_18_1","volume-title":"Global-and-Local Relative Position Embedding for Unsupervised Video Summarization. ECCV","author":"Jung Yunjae","year":"2020"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00776"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00559"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Kenji Hata Frederic Ren Li Fei-Fei and Juan Carlos Niebles. 2017. Dense-captioning events in videos. In ICCV. 706--715.  Ranjay Krishna Kenji Hata Frederic Ren Li Fei-Fei and Juan Carlos Niebles. 2017. Dense-captioning events in videos. In ICCV. 706--715.","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.5555\/3304222.3304347"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1114"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.3115\/1218955.1219032"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.144"},{"key":"e_1_3_2_1_26_1","volume-title":"Generating wikipedia by summarizing long sequences. ICLR","author":"Liu Peter J","year":"2018"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454289"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.350"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016810"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2007.04.002"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCCSP.2017.7944061"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1659"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.111"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1012"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2984066"},{"key":"e_1_3_2_1_39_1","volume-title":"Qifan Wang, Santiago Ontanon, Sumit Kumar Sanghai, Vaclav Cvicek, and Zach Fisher.","author":"Ravula Anirudh","year":"2020"},{"key":"e_1_3_2_1_40_1","unstructured":"Ramon Sanabria Ozan Caglayan Shruti Palaskar Desmond Elliott Lo\"ic Barrault Lucia Specia and Florian Metze. 2018. How2: A Large-scale Dataset for Multimodal Language Understanding. In NeurIPS.  Ramon Sanabria Ozan Caglayan Shruti Palaskar Desmond Elliott Lo\"ic Barrault Lucia Specia and Florian Metze. 2018. How2: A Large-scale Dataset for Multimodal Language Understanding. In NeurIPS."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-2074"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.548"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.445"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1641"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1910"},{"key":"e_1_3_2_1_46_1","volume-title":"VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In International Conference on Learning Representations.","author":"Su Weijie","year":"2019"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1008"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00639"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123448"},{"key":"e_1_3_2_1_56_1","volume-title":"Modeling localness for self-attention networks. arXiv preprint arXiv:1810.10182","author":"Yang Baosong","year":"2018"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.512"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654997"},{"key":"e_1_3_2_1_59_1","unstructured":"Manzil Zaheer Guru Guruganesh Avinava Dubey Joshua Ainslie Chris Alberti Santiago Ontanon Philip Pham Anirudh Ravula Qifan Wang Li Yang etal 2020. Big bird: Transformers for longer sequences. arXiv preprint arXiv:2007.14062 (2020).  Manzil Zaheer Guru Guruganesh Avinava Dubey Joshua Ainslie Chris Alberti Santiago Ontanon Philip Pham Anirudh Ravula Qifan Wang Li Yang et al. 2020. Big bird: Transformers for longer sequences. arXiv preprint arXiv:2007.14062 (2020)."},{"key":"e_1_3_2_1_60_1","volume-title":"Learning Visual Commonsense for Robust Scene Graph Generation. ECCV","author":"Zareian Alireza","year":"2020"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413518"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Luowei Zhou Hamid Palangi Lei Zhang Houdong Hu Jason J Corso and Jianfeng Gao. 2020. Unified Vision-Language Pre-Training for Image Captioning and VQA.. In AAAI. 13041--13049.  Luowei Zhou Hamid Palangi Lei Zhang Houdong Hu Jason J Corso and Jianfeng Gao. 2020. Unified Vision-Language Pre-Training for Image Captioning and VQA.. In AAAI. 13041--13049.","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-13-2122-1"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00911"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6525"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475321","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475321","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:49:18Z","timestamp":1750193358000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475321"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":65,"alternative-id":["10.1145\/3474085.3475321","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475321","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}