{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T00:15:24Z","timestamp":1777421724744,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","funder":[{"name":"Natural Sciences and Engineering Research Council of Canada (NSERC) CREATE","award":["584767- 2024"],"award-info":[{"award-number":["584767- 2024"]}]},{"name":"European Regional Development Fund (ERDF), Innovation and Digital Transition Programme (COMPETE 2030), Portugal 2030","award":["Project CDMS"],"award-info":[{"award-number":["Project CDMS"]}]},{"name":"European Regional Development Fund (ERDF), Innovation and Digital Transition Programme (COMPETE 2030), Portugal 2030","award":["reference 17409 (COMPETE2030-FEDER-01193000)"],"award-info":[{"award-number":["reference 17409 (COMPETE2030-FEDER-01193000)"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3769376","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"2037-2053","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["MinatoLoader: Accelerating Machine Learning Training Through Efficient Data Preprocessing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-6965-2871","authenticated-orcid":false,"given":"Rahma","family":"Nouaji","sequence":"first","affiliation":[{"name":"McGill University, Montreal, Quebec, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3723-6581","authenticated-orcid":false,"given":"Stella","family":"Bitchebe","sequence":"additional","affiliation":[{"name":"McGill University, Montreal, Quebec, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4036-0126","authenticated-orcid":false,"given":"Ricardo","family":"Macedo","sequence":"additional","affiliation":[{"name":"INESC TEC &amp; U.Minho, Braga, Portugal"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6822-8891","authenticated-orcid":false,"given":"Oana","family":"Balmau","sequence":"additional","affiliation":[{"name":"McGill University, Montreal, Quebec, Canada"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"d.]. KiTS19 Challenge Dataset. https:\/\/kits19.grand-challenge.org\/data\/. Accessed: [Jan 12","year":"2025","unstructured":"[n. d.]. KiTS19 Challenge Dataset. https:\/\/kits19.grand-challenge.org\/data\/. Accessed: [Jan 12, 2025]."},{"key":"e_1_3_2_1_2_1","volume-title":"d.]. MLPerf Training Benchmark Suite V3.1 Results. https:\/\/mlcommons.org\/benchmarks\/training\/. Accessed: [May 5","year":"2025","unstructured":"[n. d.]. MLPerf Training Benchmark Suite V3.1 Results. https:\/\/mlcommons.org\/benchmarks\/training\/. Accessed: [May 5, 2025]."},{"key":"e_1_3_2_1_3_1","volume-title":"Accessed: [May 05","author":"NumPy","year":"2025","unstructured":"[n. d.]. NumPy - The fundamental Package for scientific computing with Python. https:\/\/numpy.org\/. Accessed: [May 05, 2025]."},{"key":"e_1_3_2_1_4_1","volume-title":"d.]. NVIDIA Data Loading Library (DALI). https:\/\/developer.nvidia.com\/dali. Accessed: [May 5","year":"2025","unstructured":"[n. d.]. NVIDIA Data Loading Library (DALI). https:\/\/developer.nvidia.com\/dali. Accessed: [May 5, 2025]."},{"key":"e_1_3_2_1_5_1","volume-title":"Accessed: [May 05","year":"2025","unstructured":"[n. d.]. Pandas: Powerful Python Data Analysis Toolkit. https:\/\/pypi.org\/project\/pandas\/. Accessed: [May 05, 2025]."},{"key":"e_1_3_2_1_6_1","volume-title":"d.]. Scikit-learn - Machine Learning in Python. https:\/\/scikit-learn.org\/stable\/. Accessed: [May 05","year":"2025","unstructured":"[n. d.]. Scikit-learn - Machine Learning in Python. https:\/\/scikit-learn.org\/stable\/. Accessed: [May 05, 2025]."},{"key":"e_1_3_2_1_7_1","volume-title":"NVIDIA A100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/a100\/ Accessed","year":"2025","unstructured":"2020. NVIDIA A100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/a100\/ Accessed: May 11, 2025."},{"key":"e_1_3_2_1_8_1","volume-title":"https:\/\/developer.nvidia.com\/management-library-nvml Accessed","author":"Managem NVIDIA","year":"2025","unstructured":"2020. NVIDIA Managem.net Library (NVML). https:\/\/developer.nvidia.com\/management-library-nvml Accessed: May 11, 2025."},{"key":"e_1_3_2_1_9_1","volume-title":"https:\/\/pytorch.org\/tutorials\/beginner\/basics\/data_tutorial.html. Accessed","author":"DataLoaders Datasets","year":"2025","unstructured":"2025. Datasets & DataLoaders. https:\/\/pytorch.org\/tutorials\/beginner\/basics\/data_tutorial.html. Accessed: May 14, 2025."},{"key":"e_1_3_2_1_10_1","volume-title":"dstat - Versatile Tool for Generating System Resource Metrics. https:\/\/linux.die.net\/man\/1\/dstat Accessed","year":"2025","unstructured":"2025. dstat - Versatile Tool for Generating System Resource Metrics. https:\/\/linux.die.net\/man\/1\/dstat Accessed: May 14, 2025."},{"key":"e_1_3_2_1_11_1","volume-title":"Accessed","year":"2025","unstructured":"2025. Global Interpreter Lock. https:\/\/pybind11.readthedocs.io\/en\/stable\/advanced\/misc.html. Accessed: May 14, 2025."},{"key":"e_1_3_2_1_12_1","unstructured":"2025. Spark Streaming Programming Guide. https:\/\/spark.apache.org\/docs\/latest\/streaming-programming-guide.html."},{"key":"e_1_3_2_1_13_1","volume-title":"IEEE International Symposium on Workload Characterization. IEEE, 30\u201343","author":"Bachkaniwala Rajveer","year":"2024","unstructured":"Rajveer Bachkaniwala, Harshith Lanka, Kexin Rong, and Ada Gavrilovska. 2024. Lotus: Characterization of Machine Learning Preprocessing Pipelines via Framework and Hardware Profiling. In IEEE International Symposium on Workload Characterization. IEEE, 30\u201343. 10.1109\/IISWC63097.2024.00013"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3572751.3572765"},{"key":"e_1_3_2_1_15_1","volume-title":"19th International Conference on Medical Image Computing and Computer-Assisted Intervention","volume":"9901","author":"\u00c7i\u00e7ek \u00d6zg\u00fcn","year":"2016","unstructured":"\u00d6zg\u00fcn \u00c7i\u00e7ek, Ahmed Abdulkadir, Soeren S. Lienkamp, Thomas Brox, and Olaf Ronneberger. 2016. 3D U-Net: Learning Dense Volumetric Segmentation from Sparse Annotation. In 19th International Conference on Medical Image Computing and Computer-Assisted Intervention, Vol. 9901. 424\u2013432. 10.1007\/978-3-319-46723-8_49"},{"key":"e_1_3_2_1_16_1","volume-title":"An Empirical Study on Low GPU Utilization of Deep Learning Jobs. In 46th IEEE\/ACM International Conference on Software Engineering. ACM, 96:1\u201396:13","author":"Gao Yanjie","year":"2024","unstructured":"Yanjie Gao, Yichen He, Xinze Li, Bo Zhao, Haoxiang Lin, Yoyo Liang, Jing Zhong, Hongyu Zhang, Jingzhou Wang, Yonghua Zeng, Keli Gui, Jie Tong, and Mao Yang. 2024. An Empirical Study on Low GPU Utilization of Deep Learning Jobs. In 46th IEEE\/ACM International Conference on Software Engineering. ACM, 96:1\u201396:13. 10.1145\/3597503.3639232"},{"key":"e_1_3_2_1_17_1","volume-title":"2022 USENIX Annual Technical Conference. USENIX Association, 689\u2013706","author":"Graur Dan","year":"2022","unstructured":"Dan Graur, Damien Aymon, Dan Kluser, Tanguy Albrici, Chandramohan A. Thekkath, and Ana Klimovic. 2022. Cachew: Machine Learning Input Data Processing as a Service. In 2022 USENIX Annual Technical Conference. USENIX Association, 689\u2013706. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/graur"},{"key":"e_1_3_2_1_18_1","volume-title":"Pecan: Cost-Efficient ML Data Preprocessing with Automatic Transformation Ordering and Hybrid Placement. In 2024 USENIX Annual Technical Conference. USENIX Association, 649\u2013665","author":"Graur Dan","year":"2024","unstructured":"Dan Graur, Oto Mraz, Muyu Li, Mohammad Sepehr Pourghannad, Chandramohan A. Thekkath, and Ana Klimovic. 2024. Pecan: Cost-Efficient ML Data Preprocessing with Automatic Transformation Ordering and Hybrid Placement. In 2024 USENIX Annual Technical Conference. USENIX Association, 649\u2013665. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/graur"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2022.3163226"},{"key":"e_1_3_2_1_20_1","unstructured":"Nicholas Heller Niranjan Sathianathen Arveen Kalapara Edward Walczak Keenan Moore Heather Kaluzniak Joel Rosenberg Paul Blake Zachary Rengel Makinna Oestreich Joshua Dean Michael Tradewell Aneri Shah Resha Tejpaul Zachary Edgerton Matthew Peterson Shaneabbas Raza Subodh Regmi Nikolaos Papanikolopoulos and Christopher Weight. 2020. The KiTS19 Challenge Data: 300 Kidney Tumor Cases with Clinical Context CT Semantic Segmentations and Surgical Outcomes. arXiv:1904.00445 https:\/\/arxiv.org\/abs\/1904.00445"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.14778\/3636218.3636238"},{"key":"e_1_3_2_1_22_1","unstructured":"Sotiris Kotsiantis Dimitris Kanellopoulos and P. Pintelas. 2006. Data Preprocessing for Supervised Learning. International Journal of Computer Science (2006)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476249.3476308"},{"key":"e_1_3_2_1_24_1","volume-title":"Refurbish Your Training Data: Reusing Partially Augmented Samples for Faster Deep Neural Network Training. In 2021 USENIX Annual Technical Conference. USENIX Association, 537\u2013550","author":"Lee Gyewon","year":"2021","unstructured":"Gyewon Lee, Irene Lee, Hyeonmin Ha, Kyung-Geun Lee, Hwarim Hyun, Ahnjae Shin, and Byung-Gon Chun. 2021. Refurbish Your Training Data: Reusing Partially Augmented Samples for Faster Deep Neural Network Training. In 2021 USENIX Annual Technical Conference. USENIX Association, 537\u2013550. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/lee"},{"key":"e_1_3_2_1_25_1","volume-title":"PreSto: An In-Storage Data Preprocessing System for Training Recommendation Models. In 51st ACM\/IEEE Annual International Symposium on Computer Architecture. IEEE, 340\u2013353","author":"Lee Yunjae","year":"2024","unstructured":"Yunjae Lee, Hyeseong Kim, and Minsoo Rhu. 2024. PreSto: An In-Storage Data Preprocessing System for Training Recommendation Models. In 51st ACM\/IEEE Annual International Symposium on Computer Architecture. IEEE, 340\u2013353. 10.1109\/ISCA59077.2024.00033"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the 3rd International Conference on Computer, Artificial Intelligence and Control Engineering.","author":"Liang Zijing","year":"2024","unstructured":"Zijing Liang, Yanjie Xu, Yifan Hong, Penghui Shang, Qi Wang, Qiang Fu, and Ke Liu. 2024. A survey of multimodel large language models. In Proceedings of the 3rd International Conference on Computer, Artificial Intelligence and Control Engineering."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_28_1","volume-title":"Christian Vecchiola, and Rahil Garnavi.","author":"Maetschke Stefan","year":"2017","unstructured":"Stefan Maetschke, Ruwan Bandara Tennakoon, Christian Vecchiola, and Rahil Garnavi. 2017. nuts-flow\/ml: data pre-processing for deep learning. (2017). arXiv:1708.06046 http:\/\/arxiv.org\/abs\/1708.06046"},{"key":"e_1_3_2_1_29_1","volume-title":"Recurrent Neural Network Transducer for Audio-Visual Speech Recognition. In IEEE Automatic Speech Recognition and Understanding Workshop. IEEE, 905\u2013912","author":"Makino Takaki","year":"2019","unstructured":"Takaki Makino, Hank Liao, Yannis M. Assael, Brendan Shillingford, Basilio Garcia, Otavio Braga, and Olivier Siohan. 2019. Recurrent Neural Network Transducer for Audio-Visual Speech Recognition. In IEEE Automatic Speech Recognition and Understanding Workshop. IEEE, 905\u2013912. 10.1109\/ASRU46091.2019.9004036"},{"key":"e_1_3_2_1_30_1","volume-title":"Accessed","author":"Massa Francisco","year":"2025","unstructured":"Francisco Massa and Ross Girshick. [n. d.]. maskrnn-benchmark: Fast, modular reference implementation of Instance Segmentation and Object Detection algorithms in PyTorch. https:\/\/github.com\/facebookresearch\/maskrcnn-benchmark. Accessed: May 11, 2025."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of Machine Learning and Systems","author":"Mattson Peter","year":"2020","unstructured":"Peter Mattson, Christine Cheng, Gregory Diamos, Cody Coleman, Paulius Micikevicius, David Patterson, Hanlin Tang, Gu-Yeon Wei, Peter Bailis, Victor Bittorf, et al. 2020. MlPerf Training Benchmark. Proceedings of Machine Learning and Systems (2020)."},{"key":"e_1_3_2_1_32_1","volume-title":"Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems","author":"Mazumder Mark","year":"2023","unstructured":"Mark Mazumder, Colby Banbury, Xiaozhe Yao, Bojan Karla\u0161, William Gaviria Rojas, Sudnya Diamos, Greg Diamos, Lynn He, Alicia Parrish, Hannah Rose Kirk, et al. 2023. Dataperf: Benchmarks for data-centric AI development. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023."},{"key":"e_1_3_2_1_33_1","volume-title":"d.]. MLPerf Benchmarking Suite - PyTorch implementation for image segmentation. https:\/\/github.com\/mlcommons\/training\/tree\/master\/image_segmentation\/pytorch. Accessed: [May 5","year":"2025","unstructured":"MLCommons. [n. d.]. MLPerf Benchmarking Suite - PyTorch implementation for image segmentation. https:\/\/github.com\/mlcommons\/training\/tree\/master\/image_segmentation\/pytorch. Accessed: [May 5, 2025]."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.14778\/3446095.3446100"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476374"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","unstructured":"Rahma Nouaji and Stella Bitchebe. 2025. Rahm-no\/MinatoLoader: MinatoLoader v1.0.1. 10.5281\/zenodo.17201356","DOI":"10.5281\/zenodo.17201356"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the 4th Workshop on Machine Learning and Systems. ACM, 65\u201372","author":"Nouaji Rahma","year":"2024","unstructured":"Rahma Nouaji, Stella Bitchebe, and Oana Balmau. 2024. SpeedyLoader: Efficient Pipelining of Data Preprocessing and Machine Learning Training. In Proceedings of the 4th Workshop on Machine Learning and Systems. ACM, 65\u201372. 10.1145\/3642970.3655824"},{"key":"e_1_3_2_1_38_1","volume-title":"2015 IEEE International Conference on Acoustics, Speech and Signal Processing. IEEE, 5206\u20135210","author":"Panayotov Vassil","year":"2015","unstructured":"Vassil Panayotov, Guoguo Chen, Daniel Povey, and Sanjeev Khudanpur. 2015. Librispeech: An ASR corpus based on public domain audio books. In 2015 IEEE International Conference on Acoustics, Speech and Signal Processing. IEEE, 5206\u20135210. 10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_39_1","volume-title":"53rd Annual IEEE\/ACM International Symposium on Microarchitecture. IEEE, 825\u2013838","author":"Park Pyeongsu","year":"2020","unstructured":"Pyeongsu Park, Heetaek Jeong, and Jangwoo Kim. 2020. TrainBox: An Extreme-Scale Neural Network Training Server Architecture by Systematically Balancing Operations. In 53rd Annual IEEE\/ACM International Symposium on Microarchitecture. IEEE, 825\u2013838. 10.1109\/MICRO50266.2020.00072"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2980942"},{"key":"e_1_3_2_1_41_1","volume-title":"Conference on Human Factors in Computing Systems. ACM, 39:1\u201339:15","author":"Sambasivan Nithya","year":"2021","unstructured":"Nithya Sambasivan, Shivani Kapania, Hannah Highfill, Diana Akrong, Praveen K. Paritosh, and Lora Aroyo. 2021. \"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI. In Conference on Human Factors in Computing Systems. ACM, 39:1\u201339:15. 10.1145\/3411764.3445518"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1186\/S40537-019-0197-0"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/S11263-022-01611-X"},{"key":"e_1_3_2_1_44_1","unstructured":"Apache Spark. 2019. Spark."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.14778\/3579075.3579083"},{"key":"e_1_3_2_1_46_1","volume-title":"The 49th Annual International Symposium on Computer Architecture. ACM, 1042\u20131057","author":"Zhao Mark","year":"2022","unstructured":"Mark Zhao, Niket Agarwal, Aarti Basant, Bugra Gedik, Satadru Pan, Mustafa Ozdal, Rakesh Komuravelli, Jerry Pan, Tianshu Bao, Haowei Lu, Sundaram Narayanan, Jack Langman, Kevin Wilfong, Harsha Rastogi, Carole-Jean Wu, Christos Kozyrakis, and Parik Pol. 2022. Understanding data storage and ingestion for large-scale deep recommendation model training: industrial product. In The 49th Annual International Symposium on Computer Architecture. ACM, 1042\u20131057. 10.1145\/3470496.3533044"},{"key":"e_1_3_2_1_47_1","unstructured":"Mark Zhao Dhruv Choudhary Devashish Tyagi Ajay Somani Max Kaplan Sung-Han Lin Sarunya Pumma Jongsoo Park Aarti Basant Niket Agarwal Carole-Jean Wu and Christos Kozyrakis. 2023. RecD: Deduplication for End-to-End Deep Learning Recommendation Model Training Infrastructure. arXiv:2211.05239 https:\/\/arxiv.org\/abs\/2211.05239"},{"key":"e_1_3_2_1_48_1","volume-title":"Tectonic-Shift: A Composite Storage Fabric for Large-Scale ML Training. In 2023 USENIX Annual Technical Conference. USENIX Association, 433\u2013449","author":"Zhao Mark","year":"2023","unstructured":"Mark Zhao, Satadru Pan, Niket Agarwal, Zhaoduo Wen, David Xu, Anand Natarajan, Pavan Kumar, Shiva Shankar P., Ritesh Tijoriwala, Karan Asher, Hao Wu, Aarti Basant, Daniel Ford, Delia David, Nezih Yigitbasi, Pratap Singh, and Carole-Jean Wu. 2023. Tectonic-Shift: A Composite Storage Fabric for Large-Scale ML Training. In 2023 USENIX Annual Technical Conference. USENIX Association, 433\u2013449."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"deposited":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:33:51Z","timestamp":1777062831000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3769376"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":48,"alternative-id":["10.1145\/3767295.3769376","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3769376","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}