{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,8]],"date-time":"2026-04-08T08:56:03Z","timestamp":1775638563731,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":73,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,22]]},"DOI":"10.1145\/3722212.3724430","type":"proceedings-article","created":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T09:00:26Z","timestamp":1750150826000},"page":"404-417","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["A\n            <scp>uto<\/scp>\n            C\n            <scp>omp<\/scp>\n            : Automated Data Compaction for Log-Structured Tables in Data Lakes"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-2547-8610","authenticated-orcid":false,"given":"Anja","family":"Gruenheid","sequence":"first","affiliation":[{"name":"Microsoft, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9151-6024","authenticated-orcid":false,"given":"Jes\u00fas","family":"Camacho-Rodr\u00edguez","sequence":"additional","affiliation":[{"name":"Microsoft, Mountain View, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3712-7358","authenticated-orcid":false,"given":"Carlo","family":"Curino","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5086-7664","authenticated-orcid":false,"given":"Raghu","family":"Ramakrishnan","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3106-4263","authenticated-orcid":false,"given":"Stanislav","family":"Pak","sequence":"additional","affiliation":[{"name":"LinkedIn, Sunnyvale, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4677-5885","authenticated-orcid":false,"given":"Sumedh","family":"Sakdeo","sequence":"additional","affiliation":[{"name":"LinkedIn, Sunnyvale, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4914-4067","authenticated-orcid":false,"given":"Lenisha","family":"Gandhi","sequence":"additional","affiliation":[{"name":"LinkedIn, Sunnyvale, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1591-6000","authenticated-orcid":false,"given":"Sandeep K.","family":"Singhal","sequence":"additional","affiliation":[{"name":"LinkedIn, Sunnyvale, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1831-062X","authenticated-orcid":false,"given":"Pooja","family":"Nilangekar","sequence":"additional","affiliation":[{"name":"University of Maryland, College Park, MD, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3771-2995","authenticated-orcid":false,"given":"Daniel J.","family":"Abadi","sequence":"additional","affiliation":[{"name":"University of Maryland, College Park, MD, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,6,22]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Josep Aguilar-Saborit Raghu Ramakrishnan Kevin Bocksrocker Alan Halverson Konstantin Kosinsky Ryan O'Connor Nadejda Poliakova Moe Shafiei Taewoo Kim Phil Kon-Kim Haris Mahmud-Ansari Blazej Matuszyk Matt Miles Sumin Mohanan Cristian Petculescu Ishan Rahesh-Madan Emma Rose-Wirshing and Elias Yousefi. 2024. Extending Polaris to Support Transactions. In ACM SIGMOD.","DOI":"10.1145\/3626246.3653392"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Muhammad Yousuf Ahmad and Bettina Kemme. 2015. Compaction Management in Distributed Key-Value Datastores. In PVLDB.","DOI":"10.14778\/2757807.2757810"},{"key":"e_1_3_2_1_3_1","unstructured":"Amazon. 2025 a. AWS Pricing Calculator. https:\/\/calculator.aws."},{"key":"e_1_3_2_1_4_1","unstructured":"Amazon. 2025 b. S3 - Cloud Object Storage. https:\/\/aws.amazon.com\/s3\/."},{"key":"e_1_3_2_1_5_1","unstructured":"Apache Hudi. 2025 a. https:\/\/hudi.apache.org\/."},{"key":"e_1_3_2_1_6_1","unstructured":"Apache Hudi. 2025 b. Compaction. https:\/\/hudi.apache.org\/docs\/0.13.1\/compaction\/."},{"key":"e_1_3_2_1_7_1","unstructured":"Apache Iceberg. 2025 a. https:\/\/iceberg.apache.org\/."},{"key":"e_1_3_2_1_8_1","unstructured":"Apache Iceberg. 2025 b. Compaction. https:\/\/iceberg.apache.org\/docs\/1.4.3\/maintenance\/#compact-data-files."},{"key":"e_1_3_2_1_9_1","unstructured":"Apache Iceberg. 2025 c. Metadata Tables. https:\/\/iceberg.apache.org\/docs\/1.4.3\/spark-queries\/#all-metadata-tables."},{"key":"e_1_3_2_1_10_1","unstructured":"Apache Iceberg. 2025 d. Rewrite Data Files Parameters. https:\/\/iceberg.apache.org\/javadoc\/1.4.3\/org\/apache\/iceberg\/actions\/RewriteDataFiles.html."},{"key":"e_1_3_2_1_11_1","unstructured":"Apache Iceberg. 2025 e. Table Migration. https:\/\/iceberg.apache.org\/docs\/1.4.3\/spark-procedures\/#table-migration."},{"key":"e_1_3_2_1_12_1","unstructured":"Apache ORC. 2025. https:\/\/orc.apache.org\/."},{"key":"e_1_3_2_1_13_1","unstructured":"Apache Ozone. 2025. https:\/\/ozone.apache.org\/."},{"key":"e_1_3_2_1_14_1","unstructured":"Apache Paimon (incubating). 2025. https:\/\/paimon.apache.org\/."},{"key":"e_1_3_2_1_15_1","unstructured":"Apache Parquet. 2025. https:\/\/parquet.apache.org\/."},{"key":"e_1_3_2_1_16_1","unstructured":"Apache Polaris (incubating). 2025. https:\/\/polaris.apache.org\/."},{"key":"e_1_3_2_1_17_1","unstructured":"Apache Spark. 2025. Adaptive Query Execution. https:\/\/archive.apache.org\/dist\/spark\/docs\/3.1.1\/sql-performance-tuning.html#adaptive-query-execution."},{"key":"e_1_3_2_1_18_1","volume-title":"Adrian Ionescu, Alicja Luszczak, Michal Switakowski, Takuya Ueshin, Xiao Li, Michal Szafranski, Pieter Senster, and Matei Zaharia.","author":"Armbrust Michael","year":"2020","unstructured":"Michael Armbrust, Tathagata Das, Sameer Paranjpye, Reynold Xin, Shixiong Zhu, Ali Ghodsi, Burak Yavuz, Mukul Murthy, Joseph Torres, Liwen Sun, Peter A. Boncz, Mostafa Mokhtar, Herman Van Hovell, Adrian Ionescu, Alicja Luszczak, Michal Switakowski, Takuya Ueshin, Xiao Li, Michal Szafranski, Pieter Senster, and Matei Zaharia. 2020. Delta Lake: High-Performance ACID Table Storage over Cloud Object Stores. In PVLDB."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Joy Arulraj Andrew Pavlo and Prashanth Menon. 2016. Bridging the Archipelago between Row-Stores and Column-Stores for Hybrid Workloads. In ACM SIGMOD.","DOI":"10.1145\/2882903.2915231"},{"key":"e_1_3_2_1_20_1","unstructured":"Jeff Barr. 2024. Amazon S3 Tables: Storage optimized for analytics workloads. https:\/\/aws.amazon.com\/blogs\/aws\/new-amazon-s3-tables-storage-optimized-for-analytics-workloads\/"},{"key":"e_1_3_2_1_21_1","unstructured":"Ryan Blue. 2023. Hello World of CDC! https:\/\/tabular.io\/blog\/hello-world-of-cdc\/"},{"key":"e_1_3_2_1_22_1","volume-title":"Carlo Curino, and Raghu Ramakrishnan","author":"Camacho-Rodr\u00edguez Jes\u00fas","year":"2024","unstructured":"Jes\u00fas Camacho-Rodr\u00edguez, Ashvin Agrawal, Anja Gruenheid, Ashit Gosalia, Cristian Petculescu, Josep Aguilar-Saborit, Avrilia Floratou, Carlo Curino, and Raghu Ramakrishnan. 2024. LST-Bench: Benchmarking Log-Structured Tables in the Cloud. In ACM SIGMOD."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3299869.3314045"},{"key":"e_1_3_2_1_24_1","unstructured":"Cloud Analytics Benchmark (CAB) Tool. 2025. https:\/\/github.com\/alexandervanrenen\/cab."},{"key":"e_1_3_2_1_25_1","unstructured":"Databricks. 2025 a. Announcing General Availability of Predictive Optimization. https:\/\/www.databricks.com\/blog\/announcing-general-availability-predictive-optimization."},{"key":"e_1_3_2_1_26_1","unstructured":"Databricks. 2025 b. Databricks Platform. https:\/\/www.databricks.com\/."},{"key":"e_1_3_2_1_27_1","unstructured":"Delta Lake. 2025 a. https:\/\/delta.io\/."},{"key":"e_1_3_2_1_28_1","unstructured":"Delta Lake. 2025 b. Compaction. https:\/\/docs.delta.io\/2.4.0\/optimizations-oss.html#compaction-bin-packing."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2463676.2463710"},{"key":"e_1_3_2_1_30_1","unstructured":"Google Cloud. 2025 a. Cloud Storage. https:\/\/cloud.google.com\/storage."},{"key":"e_1_3_2_1_31_1","unstructured":"Google Cloud. 2025 b. Google Cloud Pricing Calculator. https:\/\/cloud.google.com\/products\/calculator."},{"key":"e_1_3_2_1_32_1","unstructured":"Avijit Goswami and Rajarshi Sarkar. 2023. Apache Iceberg optimization: Solving the small files problem in Amazon EMR. https:\/\/aws.amazon.com\/blogs\/big-data\/apache-iceberg-optimization-solving-the-small-files-problem-in-amazon-emr\/"},{"key":"e_1_3_2_1_33_1","volume-title":"Kolovson and Michael Stonebraker","author":"Curtis","year":"1989","unstructured":"Curtis P. Kolovson and Michael Stonebraker. 1989. Indexing Techniques for Historical Databases. In IEEE ICDE."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Brian Kroth Sergiy Matusevych Rana Alotaibi Yiwen Zhu Anja Gruenheid and Yuanyuan Tian. 2024. MLOS in Action: Bridging the Gap Between Experimentation and Auto-Tuning in the Cloud. In PVLDB.","DOI":"10.14778\/3685800.3685852"},{"key":"e_1_3_2_1_35_1","unstructured":"Alexey Kudinkin and Tao Meng. 2021. Hudi Z-Order and Hilbert Space Filling Curves. https:\/\/hudi.apache.org\/blog\/2021\/12\/29\/hudi-zorder-and-hilbert-space-filling-curves\/"},{"key":"e_1_3_2_1_36_1","unstructured":"Zihan Li Sudarshan Vasudevan Lei Sun and Shirshanka Das. 2021. FastIngest: Low-latency Gobblin with Apache Iceberg and ORC format. https:\/\/www.linkedin.com\/blog\/engineering\/open-source\/fastingest-low-latency-gobblin"},{"key":"e_1_3_2_1_37_1","unstructured":"LinkedIn. 2025. OpenHouse. https:\/\/github.com\/linkedin\/openhouse\/."},{"key":"e_1_3_2_1_38_1","unstructured":"LST-Bench. 2025 a. https:\/\/github.com\/microsoft\/lst-bench."},{"key":"e_1_3_2_1_39_1","unstructured":"LST-Bench. 2025 b. Cloud Analytics Benchmark (CAB) Integration. https:\/\/github.com\/microsoft\/lst-bench\/issues\/335."},{"key":"e_1_3_2_1_40_1","unstructured":"Alex Merced. 2022. Compaction in Apache Iceberg: Fine-Tuning Your Iceberg Table's Data Files. https:\/\/www.dremio.com\/blog\/compaction-in-apache-iceberg-fine-tuning-your-iceberg-tables-data-files\/"},{"key":"e_1_3_2_1_41_1","unstructured":"Microsoft. 2025 a. Auto compaction for Delta Lake on Azure Databricks. https:\/\/learn.microsoft.com\/en-us\/azure\/databricks\/delta\/tune-file-size#--auto-compaction-for-delta-lake-on-azure-databricks."},{"key":"e_1_3_2_1_42_1","unstructured":"Microsoft. 2025 b. Azure Data Lake Storage. https:\/\/azure.microsoft.com\/products\/storage\/data-lake-storage."},{"key":"e_1_3_2_1_43_1","unstructured":"Microsoft. 2025 c. Azure Pricing Calculator. https:\/\/azure.microsoft.com\/pricing\/calculator\/."},{"key":"e_1_3_2_1_44_1","unstructured":"Microsoft. 2025 d. Delta Lake table optimization and V-Order. https:\/\/learn.microsoft.com\/fabric\/data-engineering\/delta-optimization-and-v-order."},{"key":"e_1_3_2_1_45_1","unstructured":"Microsoft. 2025 e. Log Analytics in Azure Monitor. https:\/\/learn.microsoft.com\/azure\/azure-monitor\/logs\/log-analytics-overview."},{"key":"e_1_3_2_1_46_1","unstructured":"Microsoft. 2025 f. Microsoft Fabric. https:\/\/www.microsoft.com\/microsoft-fabric."},{"key":"e_1_3_2_1_47_1","unstructured":"Microsoft. 2025 g. The need for optimize write on Apache Spark. https:\/\/learn.microsoft.com\/azure\/synapse-analytics\/spark\/optimize-write-for-apache-spark."},{"key":"e_1_3_2_1_48_1","unstructured":"MLOS. 2025. https:\/\/github.com\/microsoft\/MLOS."},{"key":"e_1_3_2_1_49_1","unstructured":"Raghunath Othayoth Nambiar and Meikel Poess. 2006. The Making of TPC-DS. In PVLDB."},{"key":"e_1_3_2_1_50_1","volume-title":"Narasayya and Manoj Syamala","author":"Vivek","year":"2010","unstructured":"Vivek R. Narasayya and Manoj Syamala. 2010. Workload driven index defragmentation. In IEEE ICDE."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Anton Okolnychyi Chao Sun Kazuyuki Tanimura Russell Spitzer Ryan Blue Szehon Ho Yufei Gu Vishwanath Lakkundi and D. B. Tsai. 2024. Petabyte-Scale Row-Level Operations in Data Lakehouses. In PVLDB.","DOI":"10.14778\/3685800.3685834"},{"key":"e_1_3_2_1_52_1","unstructured":"Christopher Olston Greg Chiou Laukik Chitnis Francis Liu Yiping Han Mattias Larsson Andreas Neumann Vellanki B. N. Rao Vijayanand Sankarasubramanian Siddharth Seth Chao Tian Topher ZiCornell and Xiaodan Wang. 2011. Nova: continuous Pig\/Hadoop workflows. In ACM SIGMOD."},{"key":"e_1_3_2_1_53_1","unstructured":"OpenHouse. 2025. Azure deployment using Terraform scripts. https:\/\/github.com\/linkedin\/openhouse\/tree\/main\/infra\/recipes\/terraform\/azure."},{"key":"e_1_3_2_1_54_1","unstructured":"Matthew Powers. 2023a. Delta Lake - Small File Compaction with OPTIMIZE. https:\/\/delta.io\/blog\/2023-01--25-delta-lake-small-file-compaction-optimize\/"},{"key":"e_1_3_2_1_55_1","unstructured":"Matthew Powers. 2023b. Delta Lake - Z Order. https:\/\/delta.io\/blog\/2023-06-03-delta-lake-z-order\/"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Raghu Ramakrishnan Baskar Sridharan John R. Douceur Pavan Kasturi Balaji Krishnamachari-Sampath Karthick Krishnamoorthy Peng Li Mitica Manu Spiro Michaylov Rog\u00e9rio Ramos Neil Sharman Zee Xu Youssef Barakat Chris Douglas Richard Draves Shrikant S. Naidu Shankar Shastry Atul Sikaria Simon Sun and Ramarathnam Venkatesan. 2017. Azure Data Lake Store: A Hyperscale Distributed File Service for Big Data Analytics. In ACM SIGMOD.","DOI":"10.1145\/3035918.3056100"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Subhadeep Sarkar Dimitris Staratzis Zichen Zhu and Manos Athanassoulis. 2021. Constructing and Analyzing the LSM Compaction Design Space. In PVLDB.","DOI":"10.14778\/3476249.3476274"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Russell Sears and Raghu Ramakrishnan. 2012. bLSM: a general purpose log structured merge tree. In ACM SIGMOD.","DOI":"10.1145\/2213836.2213862"},{"key":"e_1_3_2_1_59_1","unstructured":"Russell Sears and Catharine van Ingen. 2007. Fragmentation in Large Object Repositories. In CIDR."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/320473.320484"},{"key":"e_1_3_2_1_61_1","unstructured":"Konstantin V. Shvachko Chen Liang and Simbarashe Dzinamarira. 2021. The exabyte club: LinkedIn's journey of scaling the Hadoop Distributed File System. https:\/\/www.linkedin.com\/blog\/engineering\/open-source\/the-exabyte-club-linkedin-s-journey-of-scaling-the-hadoop-distr"},{"key":"e_1_3_2_1_62_1","volume-title":"Thomas Peh, and Christof Bornh\u00f6vd.","author":"Sikka Vishal","year":"2012","unstructured":"Vishal Sikka, Franz F\u00e4rber, Wolfgang Lehner, Sang Kyun Cha, Thomas Peh, and Christof Bornh\u00f6vd. 2012. Efficient transaction processing in SAP HANA database: the end of a column store myth. In ACM SIGMOD."},{"key":"e_1_3_2_1_63_1","unstructured":"Snowflake. 2023. The Snowflake Platform. https:\/\/www.snowflake.com\/."},{"key":"e_1_3_2_1_64_1","unstructured":"S\u00e9bastien Stormacq. 2023. AWS Glue Data Catalog now supports automatic compaction of Apache Iceberg tables. https:\/\/aws.amazon.com\/blogs\/aws\/aws-glue-data-catalog-now-supports-automatic-compaction-of-apache-iceberg-tables\/"},{"key":"e_1_3_2_1_65_1","unstructured":"Anupom Syam. 2023. Optimizing data warehouse storage. https:\/\/netflixtechblog.com\/optimizing-data-warehouse-storage-7b94a48fdcbe"},{"key":"e_1_3_2_1_66_1","unstructured":"TPC. 2021. TPC-DS Specification Version 3.2.0. https:\/\/www.tpc.org\/tpc_documents_current_versions\/pdf\/tpc-ds_v3.2.0.pdf."},{"key":"e_1_3_2_1_67_1","unstructured":"Unity Catalog. 2025. https:\/\/www.unitycatalog.io\/."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"crossref","unstructured":"Alexander van Renen and Viktor Leis. 2023. Cloud Analytics Benchmark. In PVLDB.","DOI":"10.14778\/3583140.3583156"},{"key":"e_1_3_2_1_69_1","unstructured":"Jack Vanlightly. 2024a. Understanding Apache Hudi's Consistency Model. https:\/\/jack-vanlightly.com\/analyses\/2024\/4\/24\/understanding-apache-hudi-consistency-model-part-1"},{"key":"e_1_3_2_1_70_1","unstructured":"Jack Vanlightly. 2024b. Understanding Apache Iceberg's Consistency Model. https:\/\/jack-vanlightly.com\/analyses\/2024\/7\/30\/understanding-apache-icebergs-consistency-model-part1"},{"key":"e_1_3_2_1_71_1","unstructured":"Jack Vanlightly. 2024c. Understanding Delta Lake's consistency model. https:\/\/jack-vanlightly.com\/analyses\/2024\/4\/29\/understanding-delta-lakes-consistency-model"},{"key":"e_1_3_2_1_72_1","unstructured":"Midhul Vuppalapati Justin Miron Rachit Agarwal Dan Truong Ashish Motivala and Thierry Cruanes. 2020. Building An Elastic Query Engine on Disaggregated Storage. In USENIX NSDI."},{"key":"e_1_3_2_1_73_1","volume-title":"FLAML: A Fast and Lightweight AutoML Library. In MLSys.","author":"Wang Chi","year":"2021","unstructured":"Chi Wang, Qingyun Wu, Markus Weimer, and Erkang Zhu. 2021. FLAML: A Fast and Lightweight AutoML Library. In MLSys."}],"event":{"name":"SIGMOD\/PODS '25: International Conference on Management of Data","location":"Berlin Germany","acronym":"SIGMOD\/PODS '25","sponsor":["SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Companion of the 2025 International Conference on Management of Data"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3722212.3724430","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T22:38:03Z","timestamp":1757543883000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3722212.3724430"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,22]]},"references-count":73,"alternative-id":["10.1145\/3722212.3724430","10.1145\/3722212"],"URL":"https:\/\/doi.org\/10.1145\/3722212.3724430","relation":{},"subject":[],"published":{"date-parts":[[2025,6,22]]},"assertion":[{"value":"2025-06-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}