{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T14:01:36Z","timestamp":1780063296655,"version":"3.54.0"},"publisher-location":"New York, NY, USA","reference-count":75,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T00:00:00Z","timestamp":1781913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Institute of Information & communications Technology Planning & Evaluation (IITP)","award":["RS-2024-00405128"],"award-info":[{"award-number":["RS-2024-00405128"]}]},{"name":"Institute of Information & communications Technology Planning & Evaluation (IITP)","award":["IITP-2026-RS-2024-00418784"],"award-info":[{"award-number":["IITP-2026-RS-2024-00418784"]}]},{"name":"National Research Foundation of Korea (NRF)","award":["RS-2022-NR070834"],"award-info":[{"award-number":["RS-2022-NR070834"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,21]]},"DOI":"10.1145\/3745756.3809212","type":"proceedings-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:52:21Z","timestamp":1780059141000},"page":"405-417","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Ouroboros: Instilling Motion Awareness in ViTs for Efficient Video Analytics on the Edge"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-3975-2566","authenticated-orcid":false,"given":"Chanjeong","family":"Park","sequence":"first","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3602-0720","authenticated-orcid":false,"given":"Donggyu","family":"Yang","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8941-9601","authenticated-orcid":false,"given":"Sooyoung","family":"Kwon","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3659-2615","authenticated-orcid":false,"given":"Gibum","family":"Park","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0785-9291","authenticated-orcid":false,"given":"Carlee","family":"Joe-Wong","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8647-1476","authenticated-orcid":false,"given":"Kyunghan","family":"Lee","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Advanced Micro Devices Inc. Advanced Media Framework (AMF) SDK. https:\/\/gpuopen.com\/advanced-media-framework 2026."},{"key":"e_1_3_2_1_2_1","volume-title":"VideoToolbox Framework Documentation. https:\/\/developer.apple.com\/documentation\/videotoolbox","author":"Apple Inc.","year":"2026","unstructured":"Apple Inc. VideoToolbox Framework Documentation. https:\/\/developer.apple.com\/documentation\/videotoolbox, 2026."},{"key":"e_1_3_2_1_3_1","first-page":"6846","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Arnab A.","year":"2021","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., and Schmid, C. ViViT: A video vision transformer. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (2021), pp. 6836\u20136846."},{"key":"e_1_3_2_1_4_1","first-page":"4","volume-title":"Proceedings of the International Conference on Machine Learning","volume":"2","author":"Bertasius G.","year":"2021","unstructured":"Bertasius, G., Wang, H., and Torresani, L. Is space-time attention all you need for video understanding? In Proceedings of the International Conference on Machine Learning (2021), vol. 2, p. 4."},{"key":"e_1_3_2_1_5_1","first-page":"424","volume-title":"Proceedings of the ACM MobiSys","author":"Bin K.","year":"2024","unstructured":"Bin, K., Park, J., Park, C., Kim, S., and Lee, K. CoActo: CoActive neural network inference offloading with fine-grained and concurrent execution. In Proceedings of the ACM MobiSys (2024), pp. 412\u2013424."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Bolya D.","year":"2023","unstructured":"Bolya, D., Fu, C.-Y., Dai, X., Zhang, P., Feichtenhofer, C., and Hoffman, J. Token merging: Your ViT but faster. In Proceedings of the International Conference on Learning Representations (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"The OpenCV library. Dr. Dobb's Journal of Software Tools","author":"Bradski G.","year":"2000","unstructured":"Bradski, G. The OpenCV library. Dr. Dobb's Journal of Software Tools (2000)."},{"key":"e_1_3_2_1_8_1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown T.","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. Language models are few-shot learners. Advances in Neural Information Processing Systems 33 (2020), 1877\u20131901.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_10_1","volume-title":"On the relationship between self-attention and convolutional layers. arXiv preprint arXiv:1911.03584","author":"Cordonnier J.-B.","year":"2019","unstructured":"Cordonnier, J.-B., Loukas, A., and Jaggi, M. On the relationship between self-attention and convolutional layers. arXiv preprint arXiv:1911.03584 (2019)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_12_1","volume-title":"Reviving shift equivariance in vision transformers. arXiv preprint arXiv:2306.07470","author":"Ding P.","year":"2023","unstructured":"Ding, P., Soselia, D., Armstrong, T., Su, J., and Huang, F. Reviving shift equivariance in vision transformers. arXiv preprint arXiv:2306.07470 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy A.","year":"2020","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al. An image is worth 16\u00d716 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_14_1","first-page":"570","volume-title":"Proceedings of the Annual Conference of the ACM SIGCOMM","author":"Du K.","year":"2020","unstructured":"Du, K., Pervaiz, A., Yuan, X., Chowdhery, A., Zhang, Q., Hoffmann, H., and Jiang, J. Server-driven video streaming for deep learning inference. In Proceedings of the Annual Conference of the ACM SIGCOMM (2020), pp. 557\u2013570."},{"key":"e_1_3_2_1_15_1","first-page":"16923","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Dutson M.","year":"2023","unstructured":"Dutson, M., Li, Y., and Gupta, M. Eventful transformers: Leveraging temporal redundancy in vision transformers. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023), pp. 16911\u201316923."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","first-page":"6","DOI":"10.1145\/358669.358692","article-title":"Random sample consensus: A paradigm for model fitting with applications to image analysis and automated cartography","volume":"24","author":"Fischler M. A.","year":"1981","unstructured":"Fischler, M. A., and Bolles, R. C. Random sample consensus: A paradigm for model fitting with applications to image analysis and automated cartography. Communications of the ACM 24, 6 (1981), 381\u2013395.","journal-title":"Communications of the ACM"},{"key":"e_1_3_2_1_18_1","first-page":"12269","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Graham B.","year":"2021","unstructured":"Graham, B., El-Nouby, A., Touvron, H., Stock, P., Joulin, A., J\u00e9gou, H., and Douze, M. LeViT: A vision transformer in ConvNet's clothing for faster inference. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (2021), pp. 12259\u201312269."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","volume-title":"Shuffle transformer: Rethinking spatial shuffle for vision transformer. arXiv preprint arXiv:2106.03650","author":"Huang Z.","year":"2021","unstructured":"Huang, Z., Ben, Y., Luo, G., Cheng, P., Yu, G., and Fu, B. Shuffle transformer: Rethinking spatial shuffle for vision transformer. arXiv preprint arXiv:2106.03650 (2021)."},{"key":"e_1_3_2_1_22_1","first-page":"722","volume-title":"Proceedings of the USENIX Annual Technical Conference","author":"Hwang J.","year":"2022","unstructured":"Hwang, J., Kim, M., Kim, D., Nam, S., Kim, Y., Kim, D., Sharma, H., and Park, J. CoVA: Exploiting compressed-domain analysis to accelerate video analytics. In Proceedings of the USENIX Annual Technical Conference (2022), pp. 707\u2013722."},{"key":"e_1_3_2_1_23_1","volume-title":"Intel Video Processing Library (Intel VPL). https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/vpl\/overview.html","author":"Intel Corporation","year":"2026","unstructured":"Intel Corporation. Intel Video Processing Library (Intel VPL). https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/vpl\/overview.html, 2026."},{"key":"e_1_3_2_1_24_1","first-page":"10","article-title":"An end-to-end compression framework based on convolutional neural networks","volume":"28","author":"Jiang F.","year":"2017","unstructured":"Jiang, F., Tao, W., Liu, S., Ren, J., Guo, X., and Zhao, D. An end-to-end compression framework based on convolutional neural networks. IEEE Transactions on Circuits and Systems for Video Technology 28, 10 (2017), 3007\u20133018.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3093337.3037698"},{"key":"e_1_3_2_1_26_1","first-page":"9123","volume-title":"Proceedings of the ACM MM","author":"Kim S.","year":"2023","unstructured":"Kim, S., Bin, K., Yang, D., Ha, S., Chong, S., and Lee, K. ENTRO: Tackling the encoding and networking trade-off in offloaded video analytics. In Proceedings of the ACM MM (2023), pp. 9115\u20139123."},{"key":"e_1_3_2_1_27_1","first-page":"15","volume-title":"Proceedings of the ACM MobiCom","author":"Laskaridis S.","year":"2020","unstructured":"Laskaridis, S., Venieris, S. I., Almeida, M., Leontiadis, I., and Lane, N. D. SPINN: Synergistic progressive inference of neural networks over device and cloud. In Proceedings of the ACM MobiCom (2020), pp. 1\u201315."},{"key":"e_1_3_2_1_28_1","first-page":"79","volume-title":"Proceedings of the IEEE International Conference on Mobile Ad Hoc and Smart Systems","author":"Lee S.","year":"2023","unstructured":"Lee, S., Nam, W., Lee, J., Ha, S., and Lee, K. N-epitomizer: Enabling semantic offloading for neural network inferences. In Proceedings of the IEEE International Conference on Mobile Ad Hoc and Smart Systems (2023), pp. 71\u201379."},{"key":"e_1_3_2_1_29_1","first-page":"1643","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Li K.","year":"2023","unstructured":"Li, K., Wang, Y., He, Y., Li, Y., Wang, Y., Wang, L., and Qiao, Y. Uniformerv2: Unlocking the potential of image ViTs for video understanding. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023), pp. 1632\u20131643."},{"key":"e_1_3_2_1_30_1","first-page":"8","volume-title":"Advances in Neural Information Processing Systems Workshops","volume":"3","author":"Li Y.","year":"2021","unstructured":"Li, Y., Bejnordi, B. E., Moons, B., Blankevoort, T., Habibian, A., Timofte, R., and Van Gool, L. Spatio-temporal gated transformers for efficient video processing. In Advances in Neural Information Processing Systems Workshops (2021), vol. 3, p. 8."},{"key":"e_1_3_2_1_31_1","first-page":"296","volume-title":"Proceedings of the European Conference on Computer Vision","author":"Li Y.","year":"2022","unstructured":"Li, Y., Mao, H., Girshick, R., and He, K. Exploring plain vision transformer backbones for object detection. In Proceedings of the European Conference on Computer Vision (2022), pp. 280\u2013296."},{"key":"e_1_3_2_1_32_1","first-page":"376","volume-title":"Proceedings of the Annual Conference of the ACM SIGCOMM","author":"Li Y.","year":"2020","unstructured":"Li, Y., Padmanabhan, A., Zhao, P., Wang, Y., Xu, G. H., and Netravali, R. Reducto: On-camera filtering for resource-efficient real-time video analytics. In Proceedings of the Annual Conference of the ACM SIGCOMM (2020), pp. 359\u2013376."},{"key":"e_1_3_2_1_33_1","first-page":"4814","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Li Y.","year":"2022","unstructured":"Li, Y., Wu, C.-Y., Fan, H., Mangalam, K., Xiong, B., Malik, J., and Feichtenhofer, C. MViTv2: Improved multiscale vision transformers for classification and detection. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022), pp. 4804\u20134814."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_35_1","first-page":"16","volume-title":"Proceedings of the ACM MobiCom","author":"Liu L.","year":"2019","unstructured":"Liu, L., Li, H., and Gruteser, M. Edge assisted real-time object detection for mobile augmented reality. In Proceedings of the ACM MobiCom (2019), pp. 1\u201316."},{"key":"e_1_3_2_1_36_1","first-page":"3044","volume-title":"Proceedings of the ACM MM","author":"Liu S.","year":"2022","unstructured":"Liu, S., Wang, T., Li, J., Sun, D., Srivastava, M., and Abdelzaher, T. AdaMask: Enabling machine-centric video streaming with adaptive frame masking for DNN inference offloading. In Proceedings of the ACM MM (2022), pp. 3035\u20133044."},{"key":"e_1_3_2_1_37_1","first-page":"12019","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Liu Z.","year":"2022","unstructured":"Liu, Z., Hu, H., Lin, Y., Yao, Z., Xie, Z., Wei, Y., Ning, J., Cao, Y., Zhang, Z., Dong, L., et al. Swin transformer v2: Scaling up capacity and resolution. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022), pp. 12009\u201312019."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_39_1","first-page":"21","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"Marin D.","year":"2023","unstructured":"Marin, D., Chang, J.-H. R., Ranjan, A., Prabhu, A., Rastegari, M., and Tuzel, O. Token pooling in vision transformers for image classification. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (2023), pp. 12\u201321."},{"key":"e_1_3_2_1_40_1","first-page":"305","volume-title":"Proceedings of the ACM MobiSys","author":"Naderiparizi S.","year":"2017","unstructured":"Naderiparizi, S., Zhang, P., Philipose, M., Priyantha, B., Liu, J., and Ganesan, D. Glimpse: A programmable early-discard camera architecture for continuous mobile vision. In Proceedings of the ACM MobiSys (2017), pp. 292\u2013305."},{"key":"e_1_3_2_1_41_1","volume-title":"Matrix Multiplication Background User's Guide. https:\/\/docs.nvidia.com\/deeplearning\/performance\/dl-performance-matrix-multiplication\/index.html","author":"NVIDIA Corporation","year":"2023","unstructured":"NVIDIA Corporation. Matrix Multiplication Background User's Guide. https:\/\/docs.nvidia.com\/deeplearning\/performance\/dl-performance-matrix-multiplication\/index.html, 2023."},{"key":"e_1_3_2_1_42_1","volume-title":"NVIDIA Video Codec SDK. https:\/\/developer.nvidia.com\/video-codec-sdk","author":"NVIDIA Corporation","year":"2026","unstructured":"NVIDIA Corporation. NVIDIA Video Codec SDK. https:\/\/developer.nvidia.com\/video-codec-sdk, 2026."},{"key":"e_1_3_2_1_43_1","volume-title":"The 2017 DAVIS challenge on video object segmentation. arXiv preprint arXiv:1704.00675","author":"Pont-Tuset J.","year":"2017","unstructured":"Pont-Tuset, J., Perazzi, F., Caelles, S., Arbel\u00e1ez, P., Sorkine-Hornung, A., and Van Gool, L. The 2017 DAVIS challenge on video object segmentation. arXiv preprint arXiv:1704.00675 (2017)."},{"key":"e_1_3_2_1_44_1","first-page":"606","article-title":"Efficiently scaling transformer inference","volume":"5","author":"Pope R.","year":"2023","unstructured":"Pope, R., Douglas, S., Chowdhery, A., Devlin, J., Bradbury, J., Heek, J., Xiao, K., Agrawal, S., and Dean, J. Efficiently scaling transformer inference. Proceedings of Machine Learning and Systems 5 (2023), 606\u2013624.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_45_1","first-page":"12116","article-title":"Do vision transformers see like convolutional neural networks?","volume":"34","author":"Raghu M.","year":"2021","unstructured":"Raghu, M., Unterthiner, T., Kornblith, S., Zhang, C., and Dosovitskiy, A. Do vision transformers see like convolutional neural networks? Advances in Neural Information Processing Systems 34 (2021), 12116\u201312128.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_46_1","first-page":"8831","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Ramesh A.","year":"2021","unstructured":"Ramesh, A., Pavlov, M., Goh, G., Gray, S., Voss, C., Radford, A., Chen, M., and Sutskever, I. Zero-shot text-to-image generation. In Proceedings of the International Conference on Machine Learning (2021), pp. 8821\u20138831."},{"key":"e_1_3_2_1_47_1","volume-title":"-J. DynamicViT: Efficient vision transformers with dynamic token sparsification. Advances in Neural Information Processing Systems","author":"Rao Y.","year":"2021","unstructured":"Rao, Y., Zhao, W., Liu, B., Lu, J., Zhou, J., and Hsieh, C.-J. DynamicViT: Efficient vision transformers with dynamic token sparsification. Advances in Neural Information Processing Systems (2021), 13937\u201313949."},{"key":"e_1_3_2_1_48_1","first-page":"5577","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Rojas-Gomez R. A.","year":"2024","unstructured":"Rojas-Gomez, R. A., Lim, T.-Y., Do, M. N., and Yeh, R. A. Making vision transformers truly shift-equivariant. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2024), pp. 5568\u20135577."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_50_1","first-page":"1964","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"Sarkar S.","year":"2025","unstructured":"Sarkar, S., Datta, G., Kundu, S., Zheng, K., Bhattacharyya, C., and Beerel, P. A. MaskVD: Region masking for efficient video object detection. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (2025), pp. 1955\u20131964."},{"key":"e_1_3_2_1_51_1","volume-title":"The missing point in vision transformers for universal image segmentation. arXiv preprint arXiv:2505.19795","author":"Shahabodini S.","year":"2025","unstructured":"Shahabodini, S., Mansoori, M., Bayatmakou, F., Abouei, J., Plataniotis, K. N., and Mohammadi, A. The missing point in vision transformers for universal image segmentation. arXiv preprint arXiv:2505.19795 (2025)."},{"key":"e_1_3_2_1_52_1","first-page":"22867","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Shang Y.","year":"2025","unstructured":"Shang, Y., Cai, M., Xu, B., Lee, Y. J., and Yan, Y. LLaVa-PruMerge: Adaptive token reduction for efficient large multimodal models. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (2025), pp. 22857\u201322867."},{"key":"e_1_3_2_1_53_1","volume-title":"arXiv preprint arXiv:2508.10104","author":"Sim\u00e9oni O.","year":"2025","unstructured":"Sim\u00e9oni, O., Vo, H. V., Seitzer, M., Baldassarre, F., Oqab, M., Jose, C., Khalidov, V., Szafraniec, M., Yi, S., Ramamonjisoa, M., et al. Dinov3. arXiv preprint arXiv:2508.10104 (2025)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2012.2221191"},{"key":"e_1_3_2_1_55_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A.","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., and Polosukhin, I. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_56_1","volume-title":"One-peace: Exploring one general representation model toward unlimited modalities. arXiv preprint arXiv:2305.11172","author":"Wang P.","year":"2023","unstructured":"Wang, P., Wang, S., Lin, J., Bai, S., Zhou, X., Zhou, J., Wang, X., and Zhou, C. One-peace: Exploring one general representation model toward unlimited modalities. arXiv preprint arXiv:2305.11172 (2023)."},{"key":"e_1_3_2_1_57_1","first-page":"18","article-title":"CST-ViT: Cascaded spatio-temporal redundancy elimination for efficient vision transformers on edge IoT devices","volume":"12","author":"Wang Q.","year":"2025","unstructured":"Wang, Q., Zou, X., Li, C., Peng, Y., Wang, H., Wen, Y., Yeerlan, M., and Chen, C. CST-ViT: Cascaded spatio-temporal redundancy elimination for efficient vision transformers on edge IoT devices. IEEE Internet of Things Journal 12, 18 (2025), 38857\u201338871.","journal-title":"IEEE Internet of Things Journal"},{"key":"e_1_3_2_1_58_1","volume-title":"Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768","author":"Wang S.","year":"2020","unstructured":"Wang, S., Li, B. Z., Khabsa, M., Fang, H., and Ma, H. Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768 (2020)."},{"key":"e_1_3_2_1_59_1","first-page":"16142","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Woo S.","year":"2023","unstructured":"Woo, S., Debnath, S., Hu, R., Chen, X., Liu, Z., Kweon, I. S., and Xie, S. ConvNeXt v2: Co-designing and scaling ConvNets with masked autoencoders. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023), pp. 16133\u201316142."},{"key":"e_1_3_2_1_60_1","first-page":"31","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Wu H.","year":"2021","unstructured":"Wu, H., Xiao, B., Codella, N., Liu, M., Dai, X., Yuan, L., and Zhang, L. CvT: Introducing convolutions to vision transformers. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (2021), pp. 22\u201331."},{"key":"e_1_3_2_1_61_1","first-page":"2739","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"36","author":"Wu S.","year":"2022","unstructured":"Wu, S., Wu, T., Tan, H., and Guo, G. Pale transformer: A general vision transformer backbone with pale-shaped attention. In Proceedings of the AAAI Conference on Artificial Intelligence (2022), vol. 36, pp. 2731\u20132739."},{"key":"e_1_3_2_1_62_1","volume-title":"Detectron2. https:\/\/github.com\/facebookresearch\/detectron2","author":"Wu Y.","year":"2019","unstructured":"Wu, Y., Kirillov, A., Massa, F., Lo, W.-Y., and Girshick, R. Detectron2. https:\/\/github.com\/facebookresearch\/detectron2, 2019."},{"key":"e_1_3_2_1_63_1","first-page":"3","article-title":"Image registration techniques: An overview. International Journal of Signal Processing","volume":"2","author":"Wyawahare M. V.","year":"2009","unstructured":"Wyawahare, M. V., Patil, P. M., Abhyankar, H. K., et al. Image registration techniques: An overview. International Journal of Signal Processing, Image Processing and Pattern Recognition 2, 3 (2009), 11\u201328.","journal-title":"Image Processing and Pattern Recognition"},{"key":"e_1_3_2_1_64_1","first-page":"1897","volume-title":"Proceedings of the IEEE INFOCOM","author":"Xiao X.","year":"2022","unstructured":"Xiao, X., Zhang, J., Wang, W., He, J., and Zhang, Q. DNN-driven compressive offloading for edge-assisted semantic video segmentation. In Proceedings of the IEEE INFOCOM (2022), pp. 1888\u20131897."},{"key":"e_1_3_2_1_65_1","first-page":"16","volume-title":"Proceedings of the ACM MobiCom","author":"Xie X.","year":"2019","unstructured":"Xie, X., and Kim, K.-H. Source compression with bounded DNN perception loss for IoT edge computer vision. In Proceedings of the ACM MobiCom (2019), pp. 1\u201316."},{"key":"e_1_3_2_1_66_1","first-page":"6386","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"38","author":"Xu Z.","year":"2024","unstructured":"Xu, Z., Wu, D., Yu, C., Chu, X., Sang, N., and Gao, C. SCTNet: Single-branch CNN with transformer semantic information for real-time segmentation. In Proceedings of the AAAI Conference on Artificial Intelligence (2024), vol. 38, pp. 6378\u20136386."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00583"},{"key":"e_1_3_2_1_68_1","first-page":"1329","volume-title":"Proceedings of the ACM MobiCom","author":"Yang K.","year":"2024","unstructured":"Yang, K., Jeong, M., Yi, J., Lee, J., Park, K., and Lee, Y. Logan: Loss-tolerant live video analytics system. In Proceedings of the ACM MobiCom (2024), pp. 1314\u20131329."},{"key":"e_1_3_2_1_69_1","first-page":"10511","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Ye L.","year":"2019","unstructured":"Ye, L., Rochan, M., Liu, Z., and Wang, Y. Cross-modal self-attention network for referring image segmentation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2019), pp. 10502\u201310511."},{"key":"e_1_3_2_1_70_1","first-page":"10","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"Yeom S.-K.","year":"2025","unstructured":"Yeom, S.-K., and Von Klitzing, J. U-MixFormer: UNet-like transformer with mix-attention for efficient semantic segmentation. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (2025), IEEE, pp. 1\u201310."},{"key":"e_1_3_2_1_71_1","first-page":"10818","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Yin H.","year":"2022","unstructured":"Yin, H., Vahdat, A., Alvarez, J. M., Mallya, A., Kautz, J., and Molchanov, P. A-ViT: Adaptive tokens for efficient vision transformer. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022), pp. 10809\u201310818."},{"key":"e_1_3_2_1_72_1","first-page":"10","volume-title":"Proceedings of the IEEE INFOCOM","author":"Yuan T.","year":"2023","unstructured":"Yuan, T., Mi, L., Wang, W., Dai, H., and Fu, X. AccDecoder: Accelerated decoding for neural-enhanced video analytics. In Proceedings of the IEEE INFOCOM (2023), pp. 1\u201310."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.544"},{"key":"e_1_3_2_1_74_1","first-page":"2","article-title":"Accelerating deep learning inference via model parallelism and partial computation offloading","volume":"34","author":"Zhou H.","year":"2022","unstructured":"Zhou, H., Li, M., Wang, N., Min, G., and Wu, J. Accelerating deep learning inference via model parallelism and partial computation offloading. IEEE Transactions on Parallel and Distributed Systems 34, 2 (2022), 475\u2013488.","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"key":"e_1_3_2_1_75_1","first-page":"1200","volume-title":"Proceedings of the IEEE INFOCOM","author":"Zhu A.","year":"2024","unstructured":"Zhu, A., Zhang, S., Shi, X., Cheng, K., Sun, H., and Lu, S. Crucio: End-to-end coordinated spatio-temporal redundancy elimination for fast video analytics. In Proceedings of the IEEE INFOCOM (2024), pp. 1191\u20131200."}],"event":{"name":"MobiSys '26: 24th Annual International Conference on Mobile Systems, Applications and Services","location":"University of Cambridge Cambridge United Kingdom","acronym":"MobiSys '26","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 24th Annual International Conference on Mobile Systems, Applications and Services"],"original-title":[],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T13:01:52Z","timestamp":1780059712000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3745756.3809212"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,20]]},"references-count":75,"alternative-id":["10.1145\/3745756.3809212","10.1145\/3745756"],"URL":"https:\/\/doi.org\/10.1145\/3745756.3809212","relation":{},"subject":[],"published":{"date-parts":[[2026,6,20]]},"assertion":[{"value":"2026-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}