{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T11:38:50Z","timestamp":1777462730358,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","funder":[{"name":"European Commission","award":["101120726"],"award-info":[{"award-number":["101120726"]}]},{"name":"European Commission","award":["101175702"],"award-info":[{"award-number":["101175702"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3805621.3807620","type":"proceedings-article","created":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:08:45Z","timestamp":1777381725000},"page":"118-126","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["STEER: Software Toolkit for Edge Efficient Retraining"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-8221-754X","authenticated-orcid":false,"given":"Konstantina","family":"Orfanou","sequence":"first","affiliation":[{"name":"Institute of Computer Science, Foundation for Research and Technology Hellas, Heraklion, Greece"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3566-1610","authenticated-orcid":false,"given":"Christos","family":"Kozanitis","sequence":"additional","affiliation":[{"name":"Institute of Computer Science, Foundation for Research and Technology Hellas, Heraklion, Greece"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2025. jtop. https:\/\/rnext.it\/jetson_stats\/reference\/jtop.htrnl#jtop.jtop.memory"},{"key":"e_1_3_2_1_2_1","unstructured":"2025. torch.profiler. https:\/\/pytorch.org\/docs\/main\/profiler.html."},{"key":"e_1_3_2_1_3_1","unstructured":"2025. Training a Classifier. https:\/\/docs.pytorch.org\/tutorials\/beginner\/blitz\/cifar10_tutorial.html"},{"key":"e_1_3_2_1_4_1","unstructured":"2025. Understanding CUDA Memory Usage. https:\/\/pytorch.org\/docs\/main\/torch_cuda_memory.html."},{"key":"e_1_3_2_1_5_1","unstructured":"2026. Jetson Orin Nano Super Developer Kit. https:\/\/www.nvidia.com\/en-us\/autonomous-machines\/embedded-systems\/jetson-orin\/nano-super-developer-kit\/"},{"key":"e_1_3_2_1_6_1","unstructured":"2026. jetson-stats 7.1.5-Jtop. https:\/\/rnext.it\/jetson_stats\/reference\/jtop.html#jtop.jtop.power"},{"key":"e_1_3_2_1_7_1","unstructured":"2026. time \u2014 Time access and conversions. https:\/\/docs.python.org\/3\/library\/time.html."},{"key":"e_1_3_2_1_8_1","unstructured":"2026. timeit \u2014 Measure execution time of small code snippets. https:\/\/docs.python.org\/3\/library\/timeit.html."},{"key":"e_1_3_2_1_9_1","unstructured":"2026. Ultralytics YOLOv5. https:\/\/docs.ultralytics.com\/models\/yolov5\/#performance-metrics."},{"key":"e_1_3_2_1_10_1","unstructured":"baijumeswani. 2024. [Training] Retraining a YOLO V8n model on device. https:\/\/github.com\/microsoft\/onnxruntime\/issues\/20201"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa (Eds.)","volume":"6","author":"Bian Song","year":"2024","unstructured":"Song Bian, Dacheng Li, Hongyi Wang, Eric Xing, and Shivaram Venkataraman. 2024. Does Compressing Activations Help Model Parallel Training?. In Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa (Eds.), Vol. 6. 239\u2013252. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2024\/file\/71381211d0abef73ed1887b83c4547b1-Paper-Conference.pdf"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3736548.3737829"},{"key":"e_1_3_2_1_13_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. CoRR abs\/2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. CoRR abs\/2010.11929 (2020). https:\/\/arxiv.org\/abs\/2010.11929"},{"key":"e_1_3_2_1_14_1","unstructured":"Elias Frantar and Dan Alistarh. 2023. QMoE: Practical Sub-1-Bit Compression of Trillion-Parameter Models. arXiv:2310.16795 [cs.LG] https:\/\/arxiv.org\/abs\/2310.16795"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613157"},{"key":"e_1_3_2_1_16_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. CoRR abs\/2106.09685","author":"Hu Edward J.","year":"2021","unstructured":"Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, and Weizhu Chen. 2021. LoRA: Low-Rank Adaptation of Large Language Models. CoRR abs\/2106.09685 (2021). arXiv:2106.09685 https:\/\/arxiv.org\/abs\/2106.09685"},{"key":"e_1_3_2_1_17_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2017","unstructured":"Diederik P. Kingma and Jimmy Ba. 2017. Adam: A Method for Stochastic Optimization. arXiv:1412.6980 [cs.LG] https:\/\/arxiv.org\/abs\/1412 6980"},{"key":"e_1_3_2_1_18_1","volume-title":"On-Device Training Under 256KB Memory. ArXiv abs\/2206.15472","author":"Lin Ji","year":"2022","unstructured":"Ji Lin, Ligeng Zhu, Wei-Ming Chen, Wei-Chen Wang, Chuang Gan, and Song Han. 2022. On-Device Training Under 256KB Memory. ArXiv abs\/2206.15472 (2022). https:\/\/api.semanticscholar.org\/CorpusID:250144436"},{"key":"e_1_3_2_1_19_1","unstructured":"NVIDIA. [n.d.]. Effective Usage of Unified Memory on Tegra. https:\/\/docs.nvidia.com\/cuda\/cuda-for-tegra-appnote\/index.html#memory-management"},{"key":"e_1_3_2_1_20_1","unstructured":"NVIDIA Corporation. 2010. NVIDIA\u00ae Tegra\u2122 Multi-Processor Architecture. Technical Report. NVIDIA Corporation. https:\/\/www.nvidia.com\/docs\/io\/90715\/tegra_multiprocessor_architecture_white_paper_final_v1.1.pdf"},{"key":"e_1_3_2_1_21_1","volume-title":"LISA: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning. arXiv:2403.17919 [cs.LG] https:\/\/arxiv.org\/abs\/2403.17919","author":"Pan Rui","year":"2024","unstructured":"Rui Pan, Xiang Liu, Shizhe Diao, Renjie Pi, Jipeng Zhang, Chi Han, and Tong Zhang. 2024. LISA: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning. arXiv:2403.17919 [cs.LG] https:\/\/arxiv.org\/abs\/2403.17919"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587459"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696083"},{"key":"e_1_3_2_1_24_1","unstructured":"PyTorch. [n.d.]. Models and pre-trained weights. https:\/\/docs.pytorch.org\/vision\/main\/models.html"},{"key":"e_1_3_2_1_25_1","volume-title":"22nd USENIX Conference on File and Storage Technologies (FAST 24)","author":"Qian Yingjin","year":"2024","unstructured":"Yingjin Qian, Marc-Andr\u00e9 Vef, Patrick Farrell, Andreas Dilger, Xi Li, Shuichi Ihara, Yinjin Fu, Wei Xue, and Andre Brinkmann. 2024. Combining Buffered I\/O and Direct I\/O in Distributed File Systems. In 22nd USENIX Conference on File and Storage Technologies (FAST 24). USENIX Association, Santa Clara, CA, 17\u201333. https:\/\/www.usenix.org\/conference\/fast24\/presentation\/qian"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613137"},{"key":"e_1_3_2_1_27_1","volume-title":"ZeRO: Memory Optimization Towards Training A Trillion Parameter Models. CoRR abs\/1910.02054","author":"Rajbhandari Samyam","year":"2019","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2019. ZeRO: Memory Optimization Towards Training A Trillion Parameter Models. CoRR abs\/1910.02054 (2019). arXiv:1910.02054http:\/\/arxiv.org\/abs\/1910.02054"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_30_1","volume-title":"Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He.","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. ZeRO-Offload: Democratizing Billion-Scale Model Training. CoRR abs\/2101.06840 (2021). arXiv:2101.06840 https:\/\/arxiv.org\/abs\/2101 06840"},{"key":"e_1_3_2_1_31_1","unstructured":"ONNX Runtime. [n.d.]. On-Device Training with ONNX Runtime. https:\/\/onnxruntime.ai\/docs\/get-started\/training-on-device.html"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Dhananjay Saikumar and Blesson Varghese. 2024. NeuroFlux: Memory-Efficient CNN Training Using Adaptive Local Learning. arXiv:2402.14139 [cs.LG] https:\/\/arxiv.org\/abs\/2402.14139","DOI":"10.1145\/3627703.3650067"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696067"},{"key":"e_1_3_2_1_34_1","unstructured":"F. Serzhenko. [n.d.]. Jetson Zero-Copy for Embedded Applications. https:\/\/www.fastcompression.com\/blog\/jetson-zero-copy.htm"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3717469"},{"key":"e_1_3_2_1_36_1","unstructured":"svekars. 2024. How to save memory by fusing the optimizer step into the backward pass. https:\/\/pytorch.org\/tutorials\/intermediate\/optimizer_step_in_backward_tutorial.html"},{"key":"e_1_3_2_1_37_1","unstructured":"Ultralytics. [n.d.]. COCO Dataset. https:\/\/docs.ultralytics.com\/datasets\/detect\/coco\/"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3567494"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3567505"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the ACM Symposium on Cloud Computing. 305\u2013318","author":"Xanthakis Giorgos","year":"2021","unstructured":"Giorgos Xanthakis, Giorgos Saloustros, Nikos Batsaras, Anastasios Papagiannis, and Angelos Bilas. 2021. Parallax: Hybrid key-value placement in lsm-based key-value stores. In Proceedings of the ACM Symposium on Cloud Computing. 305\u2013318."},{"key":"e_1_3_2_1_41_1","volume-title":"Proceedings of the 2024 USENIX Conference on Usenix Annual Technical Conference (Santa Clara, CA, USA) (USENIX ATC'24). USENIX Association, USA, Article 36","author":"Xu Mengwei","year":"2024","unstructured":"Mengwei Xu, Dongqi Cai, Yaozong Wu, Xiang Li, and Shangguang Wang. 2024. FwdLLM: efficient federated finetuning of large language models with perturbed inferences. In Proceedings of the 2024 USENIX Conference on Usenix Annual Technical Conference (Santa Clara, CA, USA) (USENIX ATC'24). USENIX Association, USA, Article 36, 18 pages."},{"key":"e_1_3_2_1_42_1","volume-title":"Minjia Zhang, Xiaoxia Wu, Conglong Li, and Yuxiong He.","author":"Yao Zhewei","year":"2022","unstructured":"Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, and Yuxiong He. 2022. ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. arXiv:2206.01861 [cs.CL] https:\/\/arxiv.org\/abs\/2206.01861"},{"key":"e_1_3_2_1_43_1","unstructured":"Sergey Zagoruyko and Nikos Komodakis. 2017. Wide Residual Networks. arXiv:1605.07146 [cs.CV] https:\/\/arxiv.org\/abs\/1605.07146"},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the 2025 USENIX Conference on Usenix Annual Technical Conference (Boston, MA, USA) (USENIX ATC '25). USENIX Association, USA, Article 99","author":"Zhan Shichen","year":"2025","unstructured":"Shichen Zhan, Li Li, and Chengzhong Xu. 2025. AssyLLM: efficient federated fine-tuning of LLMs via assembling pre-trained blocks. In Proceedings of the 2025 USENIX Conference on Usenix Annual Technical Conference (Boston, MA, USA) (USENIX ATC '25). USENIX Association, USA, Article 99, 15 pages."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629586"},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of Machine Learning and Systems 7","author":"Zhu Hanqing","year":"2025","unstructured":"Hanqing Zhu, Zhenyu Zhang, Wenyan Cong, Xi Liu, Sem Park, Vikas Chandra, Bo Long, David Z Pan, Zhangyang Wang, and Jinwon Lee. 2025. Apollo: Sgd-like memory, adamw-level performance. Proceedings of Machine Learning and Systems 7 (2025)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696087"}],"event":{"name":"EuroSys '26: 21st European Conference on Computer Systems","location":"Edinburgh Scotland Uk","acronym":"EuroMLSys '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Sixth European Workshop on Machine Learning and Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3805621.3807620","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:18:15Z","timestamp":1777382295000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805621.3807620"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,27]]},"references-count":47,"alternative-id":["10.1145\/3805621.3807620","10.1145\/3805621"],"URL":"https:\/\/doi.org\/10.1145\/3805621.3807620","relation":{},"subject":[],"published":{"date-parts":[[2026,4,27]]},"assertion":[{"value":"2026-04-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}