{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T16:21:15Z","timestamp":1776788475976,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","funder":[{"name":"Institute of Information & communications Technology Planning & Evaluation","award":["RS-2024-00419073"],"award-info":[{"award-number":["RS-2024-00419073"]}]},{"name":"Institute of Information & communications Technology Planning & Evaluation","award":["RS-2024-00436934"],"award-info":[{"award-number":["RS-2024-00436934"]}]},{"name":"Institute of Information & communications Technology Planning & Evaluation","award":["RS-2023-00254129"],"award-info":[{"award-number":["RS-2023-00254129"]}]},{"name":"Institute of Information & communications Technology Planning & Evaluation","award":["RS-2024-00436936"],"award-info":[{"award-number":["RS-2024-00436936"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,4]]},"DOI":"10.1145\/3793853.3795749","type":"proceedings-article","created":{"date-parts":[[2026,4,6]],"date-time":"2026-04-06T14:25:49Z","timestamp":1775485549000},"page":"84-95","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["MARs: Multi-Scale Convolution-Attention residual Fusion for Video Summarization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8905-7063","authenticated-orcid":false,"given":"Joon-Seok","family":"Song","sequence":"first","affiliation":[{"name":"Sungkyunkwan University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0686-1712","authenticated-orcid":false,"given":"Juyeob","family":"Lee","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3177-3538","authenticated-orcid":false,"given":"Eunil","family":"Park","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,6]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Video Summarization Techniques: A Comprehensive Review. arXiv preprint arXiv:2410.04449","author":"Alaa Toqa","year":"2024","unstructured":"Toqa Alaa, Ahmad Mongy, Assem Bakr, Mariam Diab, and Walid Gomaa. 2024. Video Summarization Techniques: A Comprehensive Review. arXiv preprint arXiv:2410.04449 (2024)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3037883"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISM52913.2021.00045"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-020-10460-0"},{"key":"e_1_3_2_1_5_1","volume-title":"Yu Yan, Felix Juefei Xu, and Chen Feng.","author":"Chen Chao","year":"2023","unstructured":"Chao Chen, Mingzhi Zhu, Ankush Pratap Singh, Yu Yan, Felix Juefei Xu, and Chen Feng. 2023. Scene summarization: Clustering scene videos into spatially diverse frames. arXiv preprint arXiv:2311.17940 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Coatnet: Marrying convolution and attention for all data sizes. Advances in neural information processing systems 34","author":"Dai Zihang","year":"2021","unstructured":"Zihang Dai, Hanxiao Liu, Quoc V Le, and Mingxing Tan. 2021. Coatnet: Marrying convolution and attention for all data sizes. Advances in neural information processing systems 34 (2021), 3965\u20133977."},{"key":"e_1_3_2_1_7_1","volume-title":"Transformer-xl: Attentive language models beyond a fixed-length context. arXiv preprint arXiv:1901.02860","author":"Dai Zihang","year":"2019","unstructured":"Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V Le, and Ruslan Salakhutdinov. 2019. Transformer-xl: Attentive language models beyond a fixed-length context. arXiv preprint arXiv:1901.02860 (2019)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_9_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xi-aohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_10_1","volume-title":"Asian conference on computer vision. Springer, 39\u201354","author":"Fajtl Jiri","year":"2018","unstructured":"Jiri Fajtl, Hajar Sadeghi Sokeh, Vasileios Argyriou, Dorothy Monekosso, and Paolo Remagnino. 2018. Summarizing videos with attention. In Asian conference on computer vision. Springer, 39\u201354."},{"key":"e_1_3_2_1_11_1","volume-title":"Supervised video summarization via multiple feature sets with parallel attention. arXiv preprint arXiv:2104.11530","author":"Ghauri Junaid Ahmed","year":"2021","unstructured":"Junaid Ahmed Ghauri, Sherzod Hakimov, and Ralph Ewerth. 2021. Supervised video summarization via multiple feature sets with parallel attention. arXiv preprint arXiv:2104.11530 (2021)."},{"key":"e_1_3_2_1_12_1","volume-title":"Conformer: Convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100","author":"Gulati Anmol","year":"2020","unstructured":"Anmol Gulati, James Qin, Chung-Cheng Chiu, Niki Parmar, Yu Zhang, Jiahui Yu, Wei Han, Shibo Wang, Zhengdong Zhang, Yonghui Wu, et al. 2020. Conformer: Convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100 (2020)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01186"},{"key":"e_1_3_2_1_14_1","volume-title":"Attention mechanisms in computer vision: A survey. Computational visual media 8, 3","author":"Guo Meng-Hao","year":"2022","unstructured":"Meng-Hao Guo, Tian-Xing Xu, Jiang-Jiang Liu, Zheng-Ning Liu, Peng-Tao Jiang, Tai-Jiang Mu, Song-Hai Zhang, Ralph R Martin, Ming-Ming Cheng, and Shi-Min Hu. 2022. Attention mechanisms in computer vision: A survey. Computational visual media 8, 3 (2022), 331\u2013368."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10584-0_33"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01428"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_18_1","volume-title":"International conference on machine learning. PMLR, 2790\u20132799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. In International conference on machine learning. PMLR, 2790\u20132799."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3275069"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2904996"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01590"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3607540.3617139"},{"key":"e_1_3_2_1_24_1","volume-title":"Reducibility among combinatorial problems. In 50 Years of Integer Programming 1958-2008: from the Early Years to the State-of-the-Art","author":"Karp Richard M","unstructured":"Richard M Karp. 2009. Reducibility among combinatorial problems. In 50 Years of Integer Programming 1958-2008: from the Early Years to the State-of-the-Art. Springer, 219\u2013241."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1093\/biomet\/33.3.239"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1111\/exsy.13707"},{"key":"e_1_3_2_1_27_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01768"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746252.3761180"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00554"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00138-025-01676-x"},{"key":"e_1_3_2_1_32_1","volume-title":"Understanding and improving transformer from a multi-particle dynamic system point of view. arXiv preprint arXiv:1906.02762","author":"Lu Yiping","year":"2019","unstructured":"Yiping Lu, Zhuohan Li, Di He, Zhiqing Sun, Bin Dong, Tao Qin, Liwei Wang, and Tie-Yan Liu. 2019. Understanding and improving transformer from a multi-particle dynamic system point of view. arXiv preprint arXiv:1906.02762 (2019)."},{"key":"e_1_3_2_1_33_1","volume-title":"Understanding the effective receptive field in deep convolutional neural networks. Advances in neural information processing systems 29","author":"Luo Wenjie","year":"2016","unstructured":"Wenjie Luo, Yujia Li, Raquel Urtasun, and Richard Zemel. 2016. Understanding the effective receptive field in deep convolutional neural networks. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.3390\/a18090572"},{"key":"e_1_3_2_1_35_1","unstructured":"Ghulam Mujtaba and Eun-Seok Ryu. 2025. EdgeVidSum: Real-Time Personalized Video Summarization at the Edge. arXiv:2506.03171 [cs.CV] https:\/\/arxiv.org\/abs\/2506.03171"},{"key":"e_1_3_2_1_36_1","volume-title":"Advances in neural information processing systems 34","author":"Narasimhan Medhini","year":"2021","unstructured":"Medhini Narasimhan, Anna Rohrbach, and Trevor Darrell. 2021. Clip-it! language-guided video summarization. Advances in neural information processing systems 34 (2021), 13988\u201314000."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2025\/855"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00778"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.3390\/app14114400"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_35"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.3389\/fdata.2022.1106776"},{"key":"e_1_3_2_1_42_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_43_1","volume-title":"Video skimming for quick browsing based on audio and image characterization. School of Computer Science","author":"Smith Michael A","unstructured":"Michael A Smith and Takeo Kanade. 1995. Video skimming for quick browsing based on audio and image characterization. School of Computer Science, Carnegie Mellon University Pittsburgh, PA, USA."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01783"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 5179\u20135187","author":"Song Yale","year":"2015","unstructured":"Yale Song, Jordi Vallmitjana, Amanda Stent, and Alejandro Jaimes. 2015. Tvsum: Summarizing web videos using titles. In Proceedings of the IEEE conference on computer vision and pattern recognition. 5179\u20135187."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00316"},{"key":"e_1_3_2_1_48_1","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, elax \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All You Need. In Advances in Neural Information Processing Systems, Vol. 30. Curran Associates, Inc."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3614419.3644023"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414064"},{"key":"e_1_3_2_1_51_1","volume-title":"Learning deep transformer models for machine translation. arXiv preprint arXiv:1906.01787","author":"Wang Qiang","year":"2019","unstructured":"Qiang Wang, Bei Li, Tong Xiao, Jingbo Zhu, Changliang Li, Derek F Wong, and Lidia S Chao. 2019. Learning deep transformer models for machine translation. arXiv preprint arXiv:1906.01787 (2019)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_47"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3312325"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.10.039"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17325"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3163855"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3039886"},{"key":"e_1_3_2_1_59_1","volume-title":"CRC standard probability and statistics tables and formulae","author":"Zwillinger Daniel","unstructured":"Daniel Zwillinger and Stephen Kokoska. 1999. CRC standard probability and statistics tables and formulae. Crc Press."}],"event":{"name":"MMSys '26: ACM Multimedia Systems Conference 2026","location":"Hong Kong Hong Kong","acronym":"MMSys '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the ACM Multimedia Systems Conference 2026"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3793853.3795749","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T15:24:39Z","timestamp":1776353079000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3793853.3795749"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,4]]},"references-count":59,"alternative-id":["10.1145\/3793853.3795749","10.1145\/3793853"],"URL":"https:\/\/doi.org\/10.1145\/3793853.3795749","relation":{},"subject":[],"published":{"date-parts":[[2026,4,4]]},"assertion":[{"value":"2026-04-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}