{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:42Z","timestamp":1750309542032,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,31]],"date-time":"2025-03-31T00:00:00Z","timestamp":1743379200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,31]]},"DOI":"10.1145\/3712676.3714447","type":"proceedings-article","created":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T13:13:23Z","timestamp":1742994803000},"page":"159-169","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SAMPL: Self-Attention Modelled Patch Learning for Efficient Visual Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5465-2819","authenticated-orcid":false,"given":"Zhiming","family":"Hu","sequence":"first","affiliation":[{"name":"Samsung AI Center, Toronto, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6020-3406","authenticated-orcid":false,"given":"Salar Hosseini","family":"Khorasgani","sequence":"additional","affiliation":[{"name":"Tenstorrent, Toronto, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1519-6710","authenticated-orcid":false,"given":"Weiming","family":"Ren","sequence":"additional","affiliation":[{"name":"Samsung AI Center, Toronto, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0598-8966","authenticated-orcid":false,"given":"Iqbal","family":"Mohomed","sequence":"additional","affiliation":[{"name":"Samsung AI Center, Toronto, Canada"}]}],"member":"320","published-online":{"date-parts":[[2025,3,31]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Wele Gedara Chaminda Bandara Naman Patel Ali Gholami Mehdi Nikkhah Motilal Agrawal and Vishal M Patel. 2023. AdaMAE: Adaptive Masking for Efficient Spatiotemporal Learning with Masked Autoencoders. In CVPR. 14507--14517.","DOI":"10.1109\/CVPR52729.2023.01394"},{"key":"e_1_3_2_1_2_1","unstructured":"Gedas Bertasius Heng Wang and Lorenzo Torresani. 2021. Is Space-Time Attention All You Need for Video Understanding?. In ICML."},{"key":"e_1_3_2_1_3_1","volume-title":"Token Merging: Your ViT but Faster. arXiv preprint arXiv:2210.09461","author":"Bolya Daniel","year":"2022","unstructured":"Daniel Bolya, Cheng-Yang Fu, Xiaoliang Dai, Peizhao Zhang, Christoph Feichtenhofer, and Judy Hoffman. 2022. Token Merging: Your ViT but Faster. arXiv preprint arXiv:2210.09461 (2022)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Mathilde Caron Hugo Touvron Ishan Misra Herv\u00e9 J\u00e9gou Julien Mairal Piotr Bojanowski and Armand Joulin. 2021. Emerging Properties in Self-Supervised Vision Transformers. In ICCV. 9650--9660.","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_1_5_1","unstructured":"Rohan Choudhury Guanglei Zhu Sihan Liu Koichiro Niinuma Kris M Kitani and L\u00e1szl\u00f3 Jeni. 2024. Don't Look Twice: Faster Video Transformers with Run-Length Tokenization."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Jean-Baptiste Cordonnier Aravindh Mahendran Alexey Dosovitskiy Dirk Weissenborn Jakob Uszkoreit and Thomas Unterthiner. 2021. Differentiable Patch Selection for Image Recognition. In CVPR. 2351--2360.","DOI":"10.1109\/CVPR46437.2021.00238"},{"volume-title":"ImageNet: A Large-Scale Hierarchical Image Database","author":"Deng Jia","key":"e_1_3_2_1_7_1","unstructured":"Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. 2009. ImageNet: A Large-Scale Hierarchical Image Database. In CVPR. IEEE, 248--255."},{"key":"e_1_3_2_1_8_1","volume-title":"Words: Transformers for Image Recognition at Scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Haoqi Fan Bo Xiong Karttikeya Mangalam Yanghao Li Zhicheng Yan Jitendra Malik and Christoph Feichtenhofer. 2021. Multiscale Vision Transformers. In ICCV. 6824--6835.","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"e_1_3_2_1_10_1","volume-title":"Farnoush Rezaei Jafari, Sunando Sengupta, Hamid Reza Vaezi Joze, Eric Sommerlade, Hamed Pirsiavash, and J\u00fcrgen Gall.","author":"Fayyaz Mohsen","year":"2022","unstructured":"Mohsen Fayyaz, Soroush Abbasi Koohpayegani, Farnoush Rezaei Jafari, Sunando Sengupta, Hamid Reza Vaezi Joze, Eric Sommerlade, Hamed Pirsiavash, and J\u00fcrgen Gall. 2022. Adaptive Token Sampling for Efficient Vision Transformers. In ECCV. Springer, 396--414."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Christoph Feichtenhofer. 2020. X3D: Expanding Architectures for Efficient Video Recognition. In CVPR. 203--213.","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"e_1_3_2_1_12_1","first-page":"35946","article-title":"Masked Autoencoders as Spatiotemporal Learners","volume":"35","author":"Feichtenhofer Christoph","year":"2022","unstructured":"Christoph Feichtenhofer, Haoqi Fan, Yanghao Li, and Kaiming He. 2022. Masked Autoencoders as Spatiotemporal Learners. NeurIPS 35 (2022), 35946--35958.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Christoph Feichtenhofer Haoqi Fan Jitendra Malik and Kaiming He. 2019. Slow-fast Networks for Video Recognition. In ICCV. 6202--6211.","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/0304-3975(85)90224-5"},{"key":"e_1_3_2_1_15_1","volume-title":"Vincent Michalski, Joanna Materzynska, Susanne Westphal, Heuna Kim, Valentin Haenel, Ingo Fruend, Peter Yianilos, Moritz Mueller-Freitag, et al.","author":"Goyal Raghav","year":"2017","unstructured":"Raghav Goyal, Samira Ebrahimi Kahou, Vincent Michalski, Joanna Materzynska, Susanne Westphal, Heuna Kim, Valentin Haenel, Ingo Fruend, Peter Yianilos, Moritz Mueller-Freitag, et al. 2017. The \"Something Something\" Video Database for Learning and Evaluating Visual Common Sense. In CVPR. 5842--5850."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Sariel Har-Peled. 2011. Geometric Approximation Algorithms. Number 173. American Mathematical Soc.","DOI":"10.1090\/surv\/173"},{"key":"e_1_3_2_1_17_1","unstructured":"Kaiming He Xinlei Chen Saining Xie Yanghao Li Piotr Doll\u00e1r and Ross Girshick. 2022. Masked Autoencoders are Scalable Vision Learners. In CVPR. 16000--16009."},{"key":"e_1_3_2_1_18_1","unstructured":"Angelos Katharopoulos and Fran\u00e7ois Fleuret. 2019. Processing Megapixel Images with Deep Attention-Sampling Models. In ICML. 3282--3291."},{"key":"e_1_3_2_1_19_1","unstructured":"Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev et al. 2017. The Kinetics Human Action Video Dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_1_20_1","volume-title":"On Information and Sufficiency. The annals of mathematical statistics 22, 1","author":"Kullback Solomon","year":"1951","unstructured":"Solomon Kullback and Richard A Leibler. 1951. On Information and Sufficiency. The annals of mathematical statistics 22, 1 (1951), 79--86."},{"key":"e_1_3_2_1_21_1","unstructured":"Youwei Liang GE Chongjian Zhan Tong Yibing Song Jue Wang and Pengtao Xie. 2021. EViT: Expediting Vision Transformers via Token Reorganizations. In ICLR."},{"key":"e_1_3_2_1_22_1","volume-title":"TSM: Temporal Shift Module for Efficient Video Understanding. In ICCV. 7083--7093.","author":"Lin Ji","year":"2019","unstructured":"Ji Lin, Chuang Gan, and Song Han. 2019. TSM: Temporal Shift Module for Efficient Video Understanding. In ICCV. 7083--7093."},{"key":"e_1_3_2_1_23_1","volume-title":"Anurag Ranjan, Anish Prabhu, Mohammad Rastegari, and Oncel Tuzel.","author":"Marin Dmitrii","year":"2021","unstructured":"Dmitrii Marin, Jen-Hao Rick Chang, Anurag Ranjan, Anish Prabhu, Mohammad Rastegari, and Oncel Tuzel. 2021. Token Pooling in Vision Transformers. arXiv preprint arXiv:2110.03860 (2021)."},{"key":"e_1_3_2_1_24_1","first-page":"23296","article-title":"Intriguing Properties of Vision Transformers","volume":"34","author":"Naseer Muhammad Muzammal","year":"2021","unstructured":"Muhammad Muzammal Naseer, Kanchana Ranasinghe, Salman H Khan, Munawar Hayat, Fahad Shahbaz Khan, and Ming-Hsuan Yang. 2021. Intriguing Properties of Vision Transformers. NeurIPS 34 (2021), 23296--23308.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_25_1","first-page":"24898","article-title":"IA-RED2: Interpretability-Aware Redundancy Reduction for Vision Transformers","volume":"34","author":"Pan Bowen","year":"2021","unstructured":"Bowen Pan, Rameswar Panda, Yifan Jiang, Zhangyang Wang, Rogerio Feris, and Aude Oliva. 2021. IA-RED2: Interpretability-Aware Redundancy Reduction for Vision Transformers. NeurIPS 34 (2021), 24898--24911.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_26_1","unstructured":"Zizheng Pan Bohan Zhuang Jing Liu Haoyu He and Jianfei Cai. 2021. Scalable Vision Transformers with Hierarchical Pooling. In ICCV. 377--386."},{"volume-title":"K-Centered Patch Sampling for Efficient Video Recognition","author":"Park Seong Hyeon","key":"e_1_3_2_1_27_1","unstructured":"Seong Hyeon Park, Jihoon Tack, Byeongho Heo, Jung-Woo Ha, and Jinwoo Shin. 2022. K-Centered Patch Sampling for Efficient Video Recognition. In ECCV. Springer, 160--176."},{"key":"e_1_3_2_1_28_1","volume-title":"Mar: Masked Autoencoders for Efficient Action Recognition","author":"Qing Zhiwu","year":"2023","unstructured":"Zhiwu Qing, Shiwei Zhang, Ziyuan Huang, Xiang Wang, Yuehuan Wang, Yiliang Lv, Changxin Gao, and Nong Sang. 2023. Mar: Masked Autoencoders for Efficient Action Recognition. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_29_1","first-page":"13937","article-title":"DynamicViT: Efficient Vision Transformers with Dynamic Token Sparsification","volume":"34","author":"Rao Yongming","year":"2021","unstructured":"Yongming Rao, Wenliang Zhao, Benlin Liu, Jiwen Lu, Jie Zhou, and Cho-Jui Hsieh. 2021. DynamicViT: Efficient Vision Transformers with Dynamic Token Sparsification. NeurIPS 34 (2021), 13937--13949.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Adria Recasens Petr Kellnhofer Simon Stent Wojciech Matusik and Antonio Torralba. 2018. Learning to Zoom: a Saliency-Based Sampling Layer for Neural Networks. In ECCV. 51--66.","DOI":"10.1007\/978-3-030-01240-3_4"},{"key":"e_1_3_2_1_31_1","volume-title":"Tokenlearner: What Can 8 Learned Tokens Do for Images and Videos? arXiv preprint arXiv:2106.11297","author":"Ryoo Michael S","year":"2021","unstructured":"Michael S Ryoo, AJ Piergiovanni, Anurag Arnab, Mostafa Dehghani, and Anelia Angelova. 2021. Tokenlearner: What Can 8 Learned Tokens Do for Images and Videos? arXiv preprint arXiv:2106.11297 (2021)."},{"key":"e_1_3_2_1_32_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A Dataset of 101 Human Actions Classes from Videos in the Wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Yehui Tang Kai Han Yunhe Wang Chang Xu Jianyuan Guo Chao Xu and Dacheng Tao. 2022. Patch Slimming for Efficient Vision Transformers. In CVPR. 12165--12174.","DOI":"10.1109\/CVPR52688.2022.01185"},{"key":"e_1_3_2_1_34_1","first-page":"10078","article-title":"VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-training","volume":"35","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-training. NeurIPS 35 (2022), 10078--10093.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_35_1","first-page":"10347","article-title":"Training Data-Efficient Image Transformers Distillation Through Attention","volume":"139","author":"Touvron Hugo","year":"2021","unstructured":"Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, and Herve Jegou. 2021. Training Data-Efficient Image Transformers Distillation Through Attention. In ICML, Vol. 139. 10347--10357.","journal-title":"ICML"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Burak Uzkent and Stefano Ermon. 2020. Learning When and Where to Zoom with Deep Reinforcement Learning. In CVPR. 12345--12354.","DOI":"10.1109\/CVPR42600.2020.01236"},{"key":"e_1_3_2_1_37_1","volume-title":"Attention is All You Need. NeurIPS 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All You Need. NeurIPS 30 (2017)."},{"volume-title":"Efficient Video Transformers with Spatial-Temporal Token Selection","author":"Wang Junke","key":"e_1_3_2_1_38_1","unstructured":"Junke Wang, Xitong Yang, Hengduo Li, Li Liu, Zuxuan Wu, and Yu-Gang Jiang. 2022. Efficient Video Transformers with Spatial-Temporal Token Selection. In ECCV. Springer, 69--86."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Xiaolong Wang Ross Girshick Abhinav Gupta and Kaiming He. 2018. Non-Local Neural Networks. In CVPR. 7794--7803.","DOI":"10.1109\/CVPR.2018.00813"},{"key":"e_1_3_2_1_40_1","volume-title":"Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning. Machine learning 8","author":"Williams Ronald J","year":"1992","unstructured":"Ronald J Williams. 1992. Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning. Machine learning 8 (1992), 229--256."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20202"},{"key":"e_1_3_2_1_42_1","unstructured":"Hongxu Yin Arash Vahdat Jose M Alvarez Arun Mallya Jan Kautz and Pavlo Molchanov. 2022. A-ViT: Adaptive Tokens for Efficient Vision Transformer. In CVPR. 10809--10818."},{"key":"e_1_3_2_1_43_1","volume-title":"Wayne Zhang, and Dahua Lin.","author":"Yue Xiaoyu","year":"2021","unstructured":"Xiaoyu Yue, Shuyang Sun, Zhanghui Kuang, Meng Wei, Philip HS Torr, Wayne Zhang, and Dahua Lin. 2021. Vision Transformer with Progressive Sampling. In ICCV. 387--396."},{"volume-title":"Self-Slimmed Vision Transformer","author":"Zong Zhuofan","key":"e_1_3_2_1_44_1","unstructured":"Zhuofan Zong, Kunchang Li, Guanglu Song, Yali Wang, Yu Qiao, Biao Leng, and Yu Liu. 2022. Self-Slimmed Vision Transformer. In ECCV. Springer, 432--448."}],"event":{"name":"MMSys '25: 16th ACM Multimedia Systems Conference","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGCOMM ACM Special Interest Group on Data Communication","SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing"],"location":"Stellenbosch South Africa","acronym":"MMSys '25"},"container-title":["Proceedings of the 16th ACM Multimedia Systems Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712676.3714447","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:38Z","timestamp":1750295918000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712676.3714447"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,31]]},"references-count":44,"alternative-id":["10.1145\/3712676.3714447","10.1145\/3712676"],"URL":"https:\/\/doi.org\/10.1145\/3712676.3714447","relation":{},"subject":[],"published":{"date-parts":[[2025,3,31]]},"assertion":[{"value":"2025-03-31","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}