{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T12:58:40Z","timestamp":1776085120700,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":177,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,9]],"date-time":"2024-06-09T00:00:00Z","timestamp":1717891200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Amazon Research Award"},{"name":"Cisco Research Award"},{"name":"Qualcomm Innovation Fellowship"},{"name":"NSF awards","award":["CNS-2147909, CNS-2211882, CNS-2239351"],"award-info":[{"award-number":["CNS-2147909, CNS-2211882, CNS-2239351"]}]},{"name":"Google Faculty Research Award"},{"name":"Samsung GRO Research Award"},{"name":"Oracle Research Award"},{"name":"Meta Research Award"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,9]]},"DOI":"10.1145\/3626246.3654683","type":"proceedings-article","created":{"date-parts":[[2024,5,23]],"date-time":"2024-05-23T10:26:39Z","timestamp":1716459999000},"page":"547-555","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["Demystifying Data Management for Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9371-8358","authenticated-orcid":false,"given":"Xupeng","family":"Miao","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1270-5185","authenticated-orcid":false,"given":"Zhihao","family":"Jia","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1681-4677","authenticated-orcid":false,"given":"Bin","family":"Cui","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,9]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"ICLR 2023 Workshop on Mathematical and Empirical Understanding of Foundation Models.","author":"Mohamed Abbas Amro Kamal","year":"2023","unstructured":"Amro Kamal Mohamed Abbas, Kushal Tirumala, Daniel Simig, Surya Ganguli, and Ari S Morcos. 2023. SemDeDup: Data-efficient learning at web-scale through semantic deduplication. In ICLR 2023 Workshop on Mathematical and Empirical Understanding of Foundation Models."},{"key":"e_1_3_2_1_2_1","volume-title":"Jasper Schulze Buschhoff, et al","author":"Ali Mehdi","year":"2023","unstructured":"Mehdi Ali, Michael Fromm, Klaudia Thellmann, Richard Rutmann, Max L\u00fcbbering, Johannes Leveling, Katrin Klug, Jan Ebert, Niclas Doll, Jasper Schulze Buschhoff, et al. 2023. Tokenizer Choice For LLM Training: Negligible or Crucial? arXiv preprint arXiv:2310.08754 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Becoming self-instruct: introducing early stopping criteria for minimal instruct tuning. arXiv preprint arXiv:2307.03692","author":"AlShikh Waseem","year":"2023","unstructured":"Waseem AlShikh, Manhal Daaboul, Kirk Goddard, Brock Imel, Kiran Kamble, Parikshith Kulkarni, and Melisa Russak. 2023. Becoming self-instruct: introducing early stopping criteria for minimal instruct tuning. 
arXiv preprint arXiv:2307.03692 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Dynamic Context Pruning for Efficient and Interpretable Autoregressive Transformers. arXiv preprint arXiv:2305.15805","author":"Anagnostidis Sotiris","year":"2023","unstructured":"Sotiris Anagnostidis, Dario Pavllo, Luca Biggio, Lorenzo Noci, Aurelien Lucchi, and Thomas Hoffmann. 2023. Dynamic Context Pruning for Efficient and Interpretable Autoregressive Transformers. arXiv preprint arXiv:2305.15805 (2023)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.14778\/3626292.3626294"},{"key":"e_1_3_2_1_6_1","volume-title":"NLU on Data Diets: Dynamic Data Subset Selection for NLP Classification Tasks. arXiv preprint arXiv:2306.03208","author":"Attendu Jean-Michel","year":"2023","unstructured":"Jean-Michel Attendu and Jean-Philippe Corbeil. 2023. NLU on Data Diets: Dynamic Data Subset Selection for NLP Classification Tasks. arXiv preprint arXiv:2306.03208 (2023)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.14778\/3554821.3554890"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.14778\/3574245.3574260"},{"key":"e_1_3_2_1_9_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6--12, 2020, virtual, Hugo Larochelle, Marc'Aurelio Ranzato, Raia Hadsell, Maria-Florina Balcan, and Hsuan-Tien Lin (Eds.). https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html"},{"key":"e_1_3_2_1_10_1","volume-title":"Instruction mining: High-quality instruction data selection for large language models. arXiv preprint arXiv:2307.06290","author":"Cao Yihan","year":"2023","unstructured":"Yihan Cao, Yanbin Kang, and Lichao Sun. 2023. Instruction mining: High-quality instruction data selection for large language models. arXiv preprint arXiv:2307.06290 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"30th USENIX Security Symposium (USENIX Security 21)","author":"Carlini Nicholas","year":"2021","unstructured":"Nicholas Carlini, Florian Tramer, Eric Wallace, Matthew Jagielski, Ariel Herbert-Voss, Katherine Lee, Adam Roberts, Tom Brown, Dawn Song, Ulfar Erlingsson, et al. 2021. Extracting training data from large language models. In 30th USENIX Security Symposium (USENIX Security 21). 2633--2650."},{"key":"e_1_3_2_1_12_1","volume-title":"Demystifying Artificial Intelligence for Data Preparation. In Companion of the 2023 International Conference on Management of Data. 13--20","author":"Chai Chengliang","year":"2023","unstructured":"Chengliang Chai, Nan Tang, Ju Fan, and Yuyu Luo. 2023. Demystifying Artificial Intelligence for Data Preparation. In Companion of the 2023 International Conference on Management of Data. 
13--20."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1017\/S1351324922000213"},{"key":"e_1_3_2_1_14_1","volume-title":"2023 a. Data-juicer: A one-stop data processing system for large language models. arXiv preprint arXiv:2309.02033","author":"Chen Daoyuan","year":"2023","unstructured":"Daoyuan Chen, Yilun Huang, Zhijian Ma, Hesen Chen, Xuchen Pan, Ce Ge, Dawei Gao, Yuexiang Xie, Zhaoyang Liu, Jinyang Gao, et al. 2023 a. Data-juicer: A one-stop data processing system for large language models. arXiv preprint arXiv:2309.02033 (2023)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2305.09246"},{"key":"e_1_3_2_1_16_1","volume-title":"2023 b. Alpagasus: Training a better alpaca with fewer data. arXiv preprint arXiv:2307.08701","author":"Chen Lichang","year":"2023","unstructured":"Lichang Chen, Shiyang Li, Jun Yan, Hai Wang, Kalpa Gunaratna, Vikas Yadav, Zheng Tang, Vijay Srinivasan, Tianyi Zhou, Heng Huang, et al. 2023 b. Alpagasus: Training a better alpaca with fewer data. arXiv preprint arXiv:2307.08701 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"2023 c. Punica: Multi-Tenant LoRA Serving. arXiv preprint arXiv:2310.18547","author":"Chen Lequn","year":"2023","unstructured":"Lequn Chen, Zihao Ye, Yongji Wu, Danyang Zhuo, Luis Ceze, and Arvind Krishnamurthy. 2023 c. Punica: Multi-Tenant LoRA Serving. arXiv preprint arXiv:2310.18547 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"Adapting Language Models to Compress Contexts. arXiv preprint arXiv:2305.14788","author":"Chevalier Alexis","year":"2023","unstructured":"Alexis Chevalier, Alexander Wettig, Anirudh Ajith, and Danqi Chen. 2023. Adapting Language Models to Compress Contexts. arXiv preprint arXiv:2305.14788 (2023)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.14778\/3007263.3007320"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/2882903.2912574"},{"key":"e_1_3_2_1_21_1","unstructured":"Hyung Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Yunxuan Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma et al. 2022. Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)."},{"key":"e_1_3_2_1_22_1","volume-title":"Chatlaw: Open-source legal large language model with integrated external knowledge bases. arXiv preprint arXiv:2306.16092","author":"Cui Jiaxi","year":"2023","unstructured":"Jiaxi Cui, Zongjian Li, Yang Yan, Bohua Chen, and Li Yuan. 2023. Chatlaw: Open-source legal large language model with integrated external knowledge bases. arXiv preprint arXiv:2306.16092 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"8-bit Matrix Multiplication for Transformers at Scale. arXiv preprint arXiv:2208.07339","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale. arXiv preprint arXiv:2208.07339 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"SpQR: A Sparse-Quantized Representation for Near-Lossless LLM Weight Compression. arXiv preprint arXiv:2306.03078","author":"Dettmers Tim","year":"2023","unstructured":"Tim Dettmers, Ruslan Svirschevski, Vage Egiazarian, Denis Kuznedelev, Elias Frantar, Saleh Ashkboos, Alexander Borzunov, Torsten Hoefler, and Dan Alistarh. 2023. SpQR: A Sparse-Quantized Representation for Near-Lossless LLM Weight Compression. arXiv preprint arXiv:2306.03078 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"Is gpt-3 a good data annotator? 
arXiv preprint arXiv:2212.10450","author":"Ding Bosheng","year":"2022","unstructured":"Bosheng Ding, Chengwei Qin, Linlin Liu, Lidong Bing, Shafiq Joty, and Boyang Li. 2022. Is gpt-3 a good data annotator? arXiv preprint arXiv:2212.10450 (2022)."},{"key":"e_1_3_2_1_26_1","volume-title":"2023 b. Longnet: Scaling transformers to 1,000,000,000 tokens. arXiv preprint arXiv:2307.02486","author":"Ding Jiayu","year":"2023","unstructured":"Jiayu Ding, Shuming Ma, Li Dong, Xingxing Zhang, Shaohan Huang, Wenhui Wang, and Furu Wei. 2023 b. Longnet: Scaling transformers to 1,000,000,000 tokens. arXiv preprint arXiv:2307.02486 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"2023 a. Enhancing Chat Language Models by Scaling High-quality Instructional Conversations. arXiv preprint arXiv:2305.14233","author":"Ding Ning","year":"2023","unstructured":"Ning Ding, Yulin Chen, Bokai Xu, Yujia Qin, Zhi Zheng, Shengding Hu, Zhiyuan Liu, Maosong Sun, and Bowen Zhou. 2023 a. Enhancing Chat Language Models by Scaling High-quality Instructional Conversations. arXiv preprint arXiv:2305.14233 (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.98"},{"key":"e_1_3_2_1_29_1","volume-title":"How abilities in large language models are affected by supervised fine-tuning data composition. arXiv preprint arXiv:2310.05492","author":"Dong Guanting","year":"2023","unstructured":"Guanting Dong, Hongyi Yuan, Keming Lu, Chengpeng Li, Mingfeng Xue, Dayiheng Liu, Wei Wang, Zheng Yuan, Chang Zhou, and Jingren Zhou. 2023. How abilities in large language models are affected by supervised fine-tuning data composition. arXiv preprint arXiv:2310.05492 (2023)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3383127"},{"key":"e_1_3_2_1_31_1","volume-title":"MoDS: Model-oriented Data Selection for Instruction Tuning. arXiv preprint arXiv:2311.15653","author":"Du Qianlong","year":"2023","unstructured":"Qianlong Du, Chengqing Zong, and Jiajun Zhang. 2023. MoDS: Model-oriented Data Selection for Instruction Tuning. arXiv preprint arXiv:2311.15653 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Who's Harry Potter? Approximate Unlearning in LLMs. arXiv preprint arXiv:2310.02238","author":"Eldan Ronen","year":"2023","unstructured":"Ronen Eldan and Mark Russinovich. 2023. Who's Harry Potter? Approximate Unlearning in LLMs. arXiv preprint arXiv:2310.02238 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"DOGE: Domain Reweighting with Generalization Estimation. In Second Agent Learning in Open-Endedness Workshop.","author":"Fan Simin","year":"2023","unstructured":"Simin Fan, Matteo Pagliardini, and Martin Jaggi. 2023 b. DOGE: Domain Reweighting with Generalization Estimation. In Second Agent Learning in Open-Endedness Workshop."},{"key":"e_1_3_2_1_34_1","volume-title":"2023 a. Fate-llm: A industrial grade federated learning framework for large language models. arXiv preprint arXiv:2310.10049","author":"Fan Tao","year":"2023","unstructured":"Tao Fan, Yan Kang, Guoqiang Ma, Weijing Chen, Wenbin Wei, Lixin Fan, and Qiang Yang. 2023 a. Fate-llm: A industrial grade federated learning framework for large language models. arXiv preprint arXiv:2310.10049 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Yuhan Liu, and Yulia Tsvetkov.","author":"Feng Shangbin","year":"2023","unstructured":"Shangbin Feng, Chan Young Park, Yuhan Liu, and Yulia Tsvetkov. 2023. 
From Pretraining Data to Language Models to Downstream Tasks: Tracking the Trails of Political Biases Leading to Unfair NLP Models. arXiv preprint arXiv:2305.08283 (2023)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.84"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.14778\/3611479.3611527"},{"key":"e_1_3_2_1_38_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022a. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_39_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022b. OPTQ: Accurate quantization for generative pre-trained transformers. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_40_1","volume-title":"Specializing Smaller Language Models towards Multi-Step Reasoning. arXiv preprint arXiv:2301.12726","author":"Fu Yao","year":"2023","unstructured":"Yao Fu, Hao Peng, Litu Ou, Ashish Sabharwal, and Tushar Khot. 2023. Specializing Smaller Language Models towards Multi-Step Reasoning. arXiv preprint arXiv:2301.12726 (2023)."},{"key":"e_1_3_2_1_41_1","unstructured":"Deep Ganguli Amanda Askell Nicholas Schiefer Thomas Liao Kamil\u0117 Luko\u0161i\u016bt\u0117 Anna Chen Anna Goldie Azalia Mirhoseini Catherine Olsson Danny Hernandez et al. 2023. The capacity for moral self-correction in large language models. arXiv preprint arXiv:2302.07459 (2023)."},{"key":"e_1_3_2_1_42_1","volume-title":"An empirical exploration in quality filtering of text data. arXiv preprint arXiv:2109.00698","author":"Gao Leo","year":"2021","unstructured":"Leo Gao. 2021. An empirical exploration in quality filtering of text data. arXiv preprint arXiv:2109.00698 (2021)."},{"key":"e_1_3_2_1_43_1","volume-title":"In-context autoencoder for context compression in a large language model. arXiv preprint arXiv:2307.06945","author":"Ge Tao","year":"2023","unstructured":"Tao Ge, Jing Hu, Xun Wang, Si-Qing Chen, and Furu Wei. 2023. In-context autoencoder for context compression in a large language model. arXiv preprint arXiv:2307.06945 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Semantic Compression With Large Language Models. arXiv preprint arXiv:2304.12512","author":"Gilbert Henry","year":"2023","unstructured":"Henry Gilbert, Michael Sandborn, Douglas C Schmidt, Jesse Spencer-Smith, and Jules White. 2023. Semantic Compression With Large Language Models. arXiv preprint arXiv:2304.12512 (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, et al.","author":"Gunasekar Suriya","year":"2023","unstructured":"Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio C\u00e9sar Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, et al. 2023. Textbooks Are All You Need. 
arXiv preprint arXiv:2306.11644 (2023)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00778-021-00664-7"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.165"},{"key":"e_1_3_2_1_48_1","unstructured":"Danny Hernandez Tom Brown Tom Conerly Nova DasSarma Dawn Drain Sheer El-Showk Nelson Elhage Zac Hatfield-Dodds Tom Henighan Tristan Hume et al. 2022. Scaling laws and interpretability of learning from repeated data. arXiv preprint arXiv:2205.10487 (2022)."},{"key":"e_1_3_2_1_49_1","volume-title":"Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al.","author":"Hoffmann Jordan","year":"2022","unstructured":"Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al. 2022a. Training compute-optimal large language models. arXiv preprint arXiv:2203.15556 (2022)."},{"key":"e_1_3_2_1_50_1","first-page":"30016","article-title":"An empirical analysis of compute-optimal large language model training","volume":"35","author":"Hoffmann Jordan","year":"2022","unstructured":"Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al. 2022b. An empirical analysis of compute-optimal large language model training. Advances in Neural Information Processing Systems , Vol. 35 (2022), 30016--30030.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_51_1","volume-title":"International Conference on Machine Learning. PMLR, 2790--2799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. In International Conference on Machine Learning. PMLR, 2790--2799."},{"key":"e_1_3_2_1_52_1","volume-title":"Distilling step-by-step! outperforming larger language models with less training data and smaller model sizes. arXiv preprint arXiv:2305.02301","author":"Hsieh Cheng-Yu","year":"2023","unstructured":"Cheng-Yu Hsieh, Chun-Liang Li, Chih-Kuan Yeh, Hootan Nakhost, Yasuhisa Fujii, Alexander Ratner, Ranjay Krishna, Chen-Yu Lee, and Tomas Pfister. 2023. Distilling step-by-step! outperforming larger language models with less training data and smaller model sizes. arXiv preprint arXiv:2305.02301 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations.","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al. 2021. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3187009.3177734"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3506712"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3514221.3526049"},{"key":"e_1_3_2_1_57_1","volume-title":"LLM Platform Security: Applying a Systematic Evaluation Framework to OpenAI's ChatGPT Plugins. arXiv preprint arXiv:2309.10254","author":"Iqbal Umar","year":"2023","unstructured":"Umar Iqbal, Tadayoshi Kohno, and Franziska Roesner. 2023. 
LLM Platform Security: Applying a Systematic Evaluation Framework to OpenAI's ChatGPT Plugins. arXiv preprint arXiv:2309.10254 (2023)."},{"key":"e_1_3_2_1_58_1","volume-title":"GPT-Zip: Deep Compression of Finetuned Large Language Models. In Workshop on Efficient Systems for Foundation Models@ ICML2023","author":"Isik Berivan","year":"2023","unstructured":"Berivan Isik, Hermann Kumbong, Wanyi Ning, Xiaozhe Yao, Sanmi Koyejo, and Ce Zhang. 2023. GPT-Zip: Deep Compression of Finetuned Large Language Models. In Workshop on Efficient Systems for Foundation Models@ ICML2023."},{"key":"e_1_3_2_1_59_1","volume-title":"Data-Efficient Finetuning Using Cross-Task Nearest Neighbors. arXiv preprint arXiv:2212.00196","author":"Ivison Hamish","year":"2022","unstructured":"Hamish Ivison, Noah A Smith, Hannaneh Hajishirzi, and Pradeep Dasigi. 2022. Data-Efficient Finetuning Using Cross-Task Nearest Neighbors. arXiv preprint arXiv:2212.00196 (2022)."},{"key":"e_1_3_2_1_60_1","volume-title":"LLM-Assisted Code Cleaning For Training Accurate Code Generators. arXiv preprint arXiv:2311.14904","author":"Jain Naman","year":"2023","unstructured":"Naman Jain, Tianjun Zhang, Wei-Lin Chiang, Joseph E Gonzalez, Koushik Sen, and Ion Stoica. 2023. LLM-Assisted Code Cleaning For Training Accurate Code Generators. arXiv preprint arXiv:2311.14904 (2023)."},{"key":"e_1_3_2_1_61_1","volume-title":"Exploring the benefits of training expert language models over instruction tuning. arXiv preprint arXiv:2302.03202","author":"Jang Joel","year":"2023","unstructured":"Joel Jang, Seungone Kim, Seonghyeon Ye, Doyoung Kim, Lajanugen Logeswaran, Moontae Lee, Kyungjae Lee, and Minjoon Seo. 2023. Exploring the benefits of training expert language models over instruction tuning. arXiv preprint arXiv:2302.03202 (2023)."},{"key":"e_1_3_2_1_62_1","volume-title":"2023 a","author":"Jiang Huiqiang","year":"2023","unstructured":"Huiqiang Jiang, Qianhui Wu, Chin-Yew Lin, Yuqing Yang, and Lili Qiu. 2023 a. Llmlingua: Compressing prompts for accelerated inference of large language models. arXiv preprint arXiv:2310.05736 (2023)."},{"key":"e_1_3_2_1_63_1","volume-title":"2023 b. LongLLMLingua: Accelerating and Enhancing LLMs in Long Context Scenarios via Prompt Compression. arXiv preprint arXiv:2310.06839","author":"Jiang Huiqiang","year":"2023","unstructured":"Huiqiang Jiang, Qianhui Wu, Xufang Luo, Dongsheng Li, Chin-Yew Lin, Yuqing Yang, and Lili Qiu. 2023 b. LongLLMLingua: Accelerating and Enhancing LLMs in Long Context Scenarios via Prompt Compression. arXiv preprint arXiv:2310.06839 (2023)."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3035918.3035933"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3183713.3196894"},{"key":"e_1_3_2_1_66_1","volume-title":"The MiniPile Challenge for Data-Efficient Language Models. arXiv preprint arXiv:2304.08442","author":"Kaddour Jean","year":"2023","unstructured":"Jean Kaddour. 2023. The MiniPile Challenge for Data-Efficient Language Models. arXiv preprint arXiv:2304.08442 (2023)."},{"key":"e_1_3_2_1_67_1","volume-title":"International Conference on Machine Learning. PMLR, 10697--10707","author":"Kandpal Nikhil","year":"2022","unstructured":"Nikhil Kandpal, Eric Wallace, and Colin Raffel. 2022. Deduplicating training data mitigates privacy risks in language models. In International Conference on Machine Learning. PMLR, 10697--10707."},{"key":"e_1_3_2_1_68_1","volume-title":"Scaling laws for neural language models. 
arXiv preprint arXiv:2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. 2020. Scaling laws for neural language models. arXiv preprint arXiv:2001.08361 (2020)."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3457543"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611575"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00447"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2203.05115"},{"key":"e_1_3_2_1_74_1","volume-title":"Beyond Scale: the Diversity Coefficient as a Data Quality Metric Demonstrates LLMs are Pre-trained on Formally Diverse Data. arXiv preprint arXiv:2306.13840","author":"Lee Alycia","year":"2023","unstructured":"Alycia Lee, Brando Miranda, and Sanmi Koyejo. 2023. Beyond Scale: the Diversity Coefficient as a Data Quality Metric Demonstrates LLMs are Pre-trained on Formally Diverse Data. arXiv preprint arXiv:2306.13840 (2023)."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.577"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3035918.3054776"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3457542"},{"key":"e_1_3_2_1_78_1","volume-title":"2023 b. Multi-step jailbreaking privacy attacks on chatgpt. arXiv preprint arXiv:2304.05197","author":"Li Haoran","year":"2023","unstructured":"Haoran Li, Dadi Guo, Wei Fan, Mingshi Xu, and Yangqiu Song. 2023 b. Multi-step jailbreaking privacy attacks on chatgpt. arXiv preprint arXiv:2304.05197 (2023)."},{"key":"e_1_3_2_1_79_1","volume-title":"2023 c. From quantity to quality: Boosting llm performance with self-guided data selection for instruction tuning. arXiv preprint arXiv:2308.12032","author":"Li Ming","year":"2023","unstructured":"Ming Li, Yong Zhang, Zhitao Li, Jiuhai Chen, Lichang Chen, Ning Cheng, Jianzong Wang, Tianyi Zhou, and Jing Xiao. 2023 c. From quantity to quality: Boosting llm performance with self-guided data selection for instruction tuning. arXiv preprint arXiv:2308.12032 (2023)."},{"key":"e_1_3_2_1_80_1","volume-title":"PyTorch Distributed: Experiences on Accelerating Data Parallel Training. Proceedings of the VLDB Endowment","volume":"13","author":"Li Shen","unstructured":"Shen Li, Yanli Zhao, Rohan Varma, Omkar Salpekar, Pieter Noordhuis, Teng Li, Adam Paszke, Jeff Smith, Brian Vaughan, Pritam Damania, et al. [n.,d.]. PyTorch Distributed: Experiences on Accelerating Data Parallel Training. Proceedings of the VLDB Endowment, Vol. 13, 12 ( [n.,d.])."},{"key":"e_1_3_2_1_81_1","volume-title":"Suriya Gunasekar, and Yin Tat Lee. 2023 a. Textbooks are all you need ii: phi-1.5 technical report. arXiv preprint arXiv:2309.05463","author":"Li Yuanzhi","year":"2023","unstructured":"Yuanzhi Li, S\u00e9bastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar, and Yin Tat Lee. 2023 a. Textbooks are all you need ii: phi-1.5 technical report. arXiv preprint arXiv:2309.05463 (2023)."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.14778\/3421424.3421431"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476403"},{"key":"e_1_3_2_1_84_1","volume-title":"MixKD: Towards Efficient Distillation of Large-scale Language Models. 
In International Conference on Learning Representations.","author":"Liang Kevin J","year":"2020","unstructured":"Kevin J Liang, Weituo Hao, Dinghan Shen, Yufan Zhou, Weizhu Chen, Changyou Chen, and Lawrence Carin. 2020. MixKD: Towards Efficient Distillation of Large-scale Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_85_1","volume-title":"AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. arXiv preprint arXiv:2306.00978","author":"Lin Ji","year":"2023","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Xingyu Dang, and Song Han. 2023. AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. arXiv preprint arXiv:2306.00978 (2023)."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.655"},{"key":"e_1_3_2_1_87_1","volume-title":"2023 b. CacheGen: Fast Context Loading for Language Model Applications. arXiv preprint arXiv:2310.07240","author":"Liu Yuhan","year":"2023","unstructured":"Yuhan Liu, Hanchen Li, Kuntai Du, Jiayi Yao, Yihua Cheng, Yuyang Huang, Shan Lu, Michael Maire, Henry Hoffmann, Ari Holtzman, et al. 2023 b. CacheGen: Fast Context Loading for Language Model Applications. arXiv preprint arXiv:2310.07240 (2023)."},{"key":"e_1_3_2_1_88_1","volume-title":"2023 a. Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression at Test Time. arXiv preprint arXiv:2305.17118","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Aditya Desai, Fangshuo Liao, Weitao Wang, Victor Xie, Zhaozhuo Xu, Anastasios Kyrillidis, and Anshumali Shrivastava. 2023 a. Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression at Test Time. arXiv preprint arXiv:2305.17118 (2023)."},{"key":"e_1_3_2_1_89_1","volume-title":"2023 d. LLM-QAT: Data-Free Quantization Aware Training for Large Language Models. arXiv preprint arXiv:2305.17888","author":"Liu Zechun","year":"2023","unstructured":"Zechun Liu, Barlas Oguz, Changsheng Zhao, Ernie Chang, Pierre Stock, Yashar Mehdad, Yangyang Shi, Raghuraman Krishnamoorthi, and Vikas Chandra. 2023 d. LLM-QAT: Data-Free Quantization Aware Training for Large Language Models. arXiv preprint arXiv:2305.17888 (2023)."},{"key":"e_1_3_2_1_90_1","volume-title":"International Conference on Machine Learning. PMLR, 22137--22176","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Jue Wang, Tri Dao, Tianyi Zhou, Binhang Yuan, Zhao Song, Anshumali Shrivastava, Ce Zhang, Yuandong Tian, Christopher Re, et al. 2023 e. Deja vu: Contextual sparsity for efficient llms at inference time. In International Conference on Machine Learning. PMLR, 22137--22176."},{"key":"e_1_3_2_1_91_1","volume-title":"Yi Tay, Denny Zhou, Quoc V Le, Barret Zoph, Jason Wei, et al. 2023 a. The flan collection: Designing data and methods for effective instruction tuning. arXiv preprint arXiv:2301.13688","author":"Longpre Shayne","year":"2023","unstructured":"Shayne Longpre, Le Hou, Tu Vu, Albert Webson, Hyung Won Chung, Yi Tay, Denny Zhou, Quoc V Le, Barret Zoph, Jason Wei, et al. 2023 a. The flan collection: Designing data and methods for effective instruction tuning. arXiv preprint arXiv:2301.13688 (2023)."},{"key":"e_1_3_2_1_92_1","volume-title":"Domain Coverage, Quality, & Toxicity. 
arXiv preprint arXiv:2305.13169","author":"Longpre Shayne","year":"2023","unstructured":"Shayne Longpre, Gregory Yauney, Emily Reif, Katherine Lee, Adam Roberts, Barret Zoph, Denny Zhou, Jason Wei, Kevin Robinson, David Mimno, et al. 2023 b. A Pretrainer's Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, & Toxicity. arXiv preprint arXiv:2305.13169 (2023)."},{"key":"e_1_3_2_1_93_1","volume-title":"Instruction Tagging for Analyzing Supervised Fine-tuning of Large Language Models. arXiv e-prints","author":"Lu Keming","year":"2023","unstructured":"Keming Lu, Hongyi Yuan, Zheng Yuan, Runji Lin, Junyang Lin, Chuanqi Tan, Chang Zhou, and Jingren Zhou. 2023. # InsTag: Instruction Tagging for Analyzing Supervised Fine-tuning of Large Language Models. arXiv e-prints (2023), arXiv--2308."},{"key":"e_1_3_2_1_94_1","volume-title":"What's in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. arXiv preprint arXiv:2105.02732","author":"Luccioni Alexandra Sasha","year":"2021","unstructured":"Alexandra Sasha Luccioni and Joseph D Viviano. 2021. What's in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. arXiv preprint arXiv:2105.02732 (2021)."},{"key":"e_1_3_2_1_95_1","volume-title":"D2 pruning: Message passing for balancing diversity and difficulty in data pruning. arXiv preprint arXiv:2310.07931","author":"Maharana Adyasha","year":"2023","unstructured":"Adyasha Maharana, Prateek Yadav, and Mohit Bansal. 2023. D2 pruning: Message passing for balancing diversity and difficulty in data pruning. arXiv preprint arXiv:2310.07931 (2023)."},{"key":"e_1_3_2_1_96_1","volume-title":"When less is more: Investigating data pruning for pretraining llms at scale. arXiv preprint arXiv:2309.04564","author":"Marion Max","year":"2023","unstructured":"Max Marion, Ahmet \u00dcst\u00fcn, Luiza Pozzobon, Alex Wang, Marzieh Fadaee, and Sara Hooker. 2023. When less is more: Investigating data pruning for pretraining llms at scale. arXiv preprint arXiv:2309.04564 (2023)."},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.132"},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3452773"},{"key":"e_1_3_2_1_99_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"e_1_3_2_1_100_1","doi-asserted-by":"publisher","DOI":"10.14778\/3598581.3598604"},{"key":"e_1_3_2_1_101_1","doi-asserted-by":"publisher","DOI":"10.14778\/3570690.3570697"},{"key":"e_1_3_2_1_102_1","volume-title":"Landmark Attention: Random-Access Infinite Context Length for Transformers. arXiv preprint arXiv:2305.16300","author":"Mohtashami Amirkeivan","year":"2023","unstructured":"Amirkeivan Mohtashami and Martin Jaggi. 2023. Landmark Attention: Random-Access Infinite Context Length for Transformers. arXiv preprint arXiv:2305.16300 (2023)."},{"key":"e_1_3_2_1_103_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582515.3609536"},{"key":"e_1_3_2_1_104_1","doi-asserted-by":"publisher","DOI":"10.5555\/1858842.1858883"},{"key":"e_1_3_2_1_105_1","volume-title":"Xiang Lisa Li, and Noah Goodman","author":"Mu Jesse","year":"2023","unstructured":"Jesse Mu, Xiang Lisa Li, and Noah Goodman. 2023. Learning to compress prompts with gist tokens. 
arXiv preprint arXiv:2304.08467 (2023)."},{"key":"e_1_3_2_1_106_1","volume-title":"Aleksandra Piktus, Nouamane Tazi, Sampo Pyysalo, Thomas Wolf, and Colin Raffel.","author":"Muennighoff Niklas","year":"2023","unstructured":"Niklas Muennighoff, Alexander M Rush, Boaz Barak, Teven Le Scao, Aleksandra Piktus, Nouamane Tazi, Sampo Pyysalo, Thomas Wolf, and Colin Raffel. 2023. Scaling Data-Constrained Language Models. arXiv preprint arXiv:2305.16264 (2023)."},{"key":"e_1_3_2_1_107_1","doi-asserted-by":"publisher","DOI":"10.14778\/3407790.3407816"},{"key":"e_1_3_2_1_108_1","doi-asserted-by":"publisher","DOI":"10.14778\/3574245.3574258"},{"key":"e_1_3_2_1_109_1","doi-asserted-by":"publisher","DOI":"10.1145\/3514221.3522567"},{"key":"e_1_3_2_1_110_1","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611564"},{"key":"e_1_3_2_1_111_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588964"},{"key":"e_1_3_2_1_112_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE53745.2022.00241"},{"key":"e_1_3_2_1_113_1","volume-title":"Codegen2: Lessons for training llms on programming and natural languages. arXiv preprint arXiv:2305.02309","author":"Nijkamp Erik","year":"2023","unstructured":"Erik Nijkamp, Hiroaki Hayashi, Caiming Xiong, Silvio Savarese, and Yingbo Zhou. 2023. Codegen2: Lessons for training llms on programming and natural languages. arXiv preprint arXiv:2305.02309 (2023)."},{"key":"e_1_3_2_1_114_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arxiv: 2303.08774 [cs.CL]"},{"key":"e_1_3_2_1_115_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476402"},{"key":"e_1_3_2_1_116_1","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. 2022. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems , Vol. 35 (2022), 27730--27744.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_117_1","volume-title":"In-context unlearning: Language models as few shot unlearners. arXiv preprint arXiv:2310.07579","author":"Pawelczyk Martin","year":"2023","unstructured":"Martin Pawelczyk, Seth Neel, and Himabindu Lakkaraju. 2023. In-context unlearning: Language models as few shot unlearners. arXiv preprint arXiv:2310.07579 (2023)."},{"key":"e_1_3_2_1_118_1","volume-title":"The RefinedWeb Dataset for Falcon LLM: Outperforming Curated Corpora with Web Data Only. In Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track.","author":"Penedo Guilherme","year":"2023","unstructured":"Guilherme Penedo, Quentin Malartic, Daniel Hesslow, Ruxandra Cojocaru, Hamza Alobeidli, Alessandro Cappelli, Baptiste Pannier, Ebtesam Almazrouei, and Julien Launay. 2023. The RefinedWeb Dataset for Falcon LLM: Outperforming Curated Corpora with Web Data Only. In Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track."},{"key":"e_1_3_2_1_119_1","volume-title":"Measuring and Narrowing the Compositionality Gap in Language Models. In Findings of the Association for Computational Linguistics: EMNLP 2023","author":"Press Ofir","year":"2023","unstructured":"Ofir Press, Muru Zhang, Sewon Min, Ludwig Schmidt, Noah A. Smith, and Mike Lewis. 2023. 
Measuring and Narrowing the Compositionality Gap in Language Models. In Findings of the Association for Computational Linguistics: EMNLP 2023, Singapore, December 6--10, 2023, Houda Bouamor, Juan Pino, and Kalika Bali (Eds.). Association for Computational Linguistics, 5687--5711. https:\/\/aclanthology.org\/2023.findings-emnlp.378"},{"key":"e_1_3_2_1_120_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_121_1","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. ZeRO-Offload: Democratizing Billion-Scale model training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 551--564."},{"key":"e_1_3_2_1_122_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.794"},{"key":"e_1_3_2_1_123_1","volume-title":"Nicola De Cao, and Paolo Papotti","author":"Saeed Mohammed","year":"2024","unstructured":"Mohammed Saeed, Nicola De Cao, and Paolo Papotti. 2024. Querying Large Language Models with SQL. EDBT (Vision paper) (2024)."},{"key":"e_1_3_2_1_124_1","doi-asserted-by":"publisher","DOI":"10.14778\/3352063.3352083"},{"key":"e_1_3_2_1_125_1","volume-title":"Multitask Prompted Training Enables Zero-Shot Task Generalization. In ICLR 2022-Tenth International Conference on Learning Representations.","author":"Sanh Victor","year":"2022","unstructured":"Victor Sanh, Albert Webson, Colin Raffel, Stephen H Bach, Lintang Sutawika, Zaid Alyafeai, Antoine Chaffin, Arnaud Stiegler, Teven Le Scao, Arun Raja, et al. 2022. Multitask Prompted Training Enables Zero-Shot Task Generalization. In ICLR 2022-Tenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_126_1","volume-title":"SlimPajama-DC: Understanding Data Combinations for LLM Training. arXiv preprint arXiv:2309.10818","author":"Shen Zhiqiang","year":"2023","unstructured":"Zhiqiang Shen, Tianhua Tao, Liqun Ma, Willie Neiswanger, Joel Hestness, Natalia Vassilieva, Daria Soboleva, and Eric Xing. 2023. SlimPajama-DC: Understanding Data Combinations for LLM Training. arXiv preprint arXiv:2309.10818 (2023)."},{"key":"e_1_3_2_1_127_1","unstructured":"Ying Sheng Shiyi Cao Dacheng Li Coleman Hooper Nicholas Lee Shuo Yang Christopher Chou Banghua Zhu Lianmin Zheng Kurt Keutzer et al. 2023. S-LoRA: Serving Thousands of Concurrent LoRA Adapters. arXiv preprint arXiv:2311.03285 (2023)."},{"key":"e_1_3_2_1_128_1","volume-title":"Detecting pretraining data from large language models. arXiv preprint arXiv:2310.16789","author":"Shi Weijia","year":"2023","unstructured":"Weijia Shi, Anirudh Ajith, Mengzhou Xia, Yangsibo Huang, Daogao Liu, Terra Blevins, Danqi Chen, and Luke Zettlemoyer. 2023. Detecting pretraining data from large language models. arXiv preprint arXiv:2310.16789 (2023)."},{"key":"e_1_3_2_1_129_1","volume-title":"Knowledge Unlearning for LLMs: Tasks, Methods, and Challenges. arXiv preprint arXiv:2311.15766","author":"Si Nianwen","year":"2023","unstructured":"Nianwen Si, Hao Zhang, Heyu Chang, Wenlin Zhang, Dan Qu, and Weiqiang Zhang. 2023. Knowledge Unlearning for LLMs: Tasks, Methods, and Challenges. arXiv preprint arXiv:2311.15766 (2023)."},{"key":"e_1_3_2_1_130_1","volume-title":"International Conference on Learning Representations (Forthcoming).
","author":"Silcock Emily","unstructured":"Emily Silcock, Luca D'Amico-Wong, Jinglin Yang, and Melissa Dell. [n.,d.]. Noise-Robust De-Duplication at Scale. In International Conference on Learning Representations (Forthcoming)."},{"key":"e_1_3_2_1_131_1","volume-title":"Nathan Scales, Ajay Tanwani, Heather Cole-Lewis, Stephen Pfohl, et al.","author":"Singhal Karan","year":"2023","unstructured":"Karan Singhal, Shekoofeh Azizi, Tao Tu, S Sara Mahdavi, Jason Wei, Hyung Won Chung, Nathan Scales, Ajay Tanwani, Heather Cole-Lewis, Stephen Pfohl, et al. 2023. Large language models encode clinical knowledge. Nature, Vol. 620, 7972 (2023), 172--180."},{"key":"e_1_3_2_1_132_1","volume-title":"Proceedings of the 2022 International Conference on Management of Data. 1493--1503","author":"Suhara Yoshihiko","unstructured":"Yoshihiko Suhara, Jinfeng Li, Yuliang Li, Dan Zhang, \u00c7a\u011fatay Demiralp, Chen Chen, and Wang-Chiew Tan. 2022. Annotating columns with pre-trained language models. In Proceedings of the 2022 International Conference on Management of Data. 1493--1503."},{"key":"e_1_3_2_1_133_1","volume-title":"Distilling task-specific knowledge from bert into simple neural networks. arXiv preprint arXiv:1903.12136","author":"Tang Raphael","year":"2019","unstructured":"Raphael Tang, Yao Lu, Linqing Liu, Lili Mou, Olga Vechtomova, and Jimmy Lin. 2019. Distilling task-specific knowledge from bert into simple neural networks. arXiv preprint arXiv:1903.12136 (2019)."},{"key":"e_1_3_2_1_134_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.125"},{"key":"e_1_3_2_1_135_1","doi-asserted-by":"publisher","DOI":"10.14778\/3447689.3447706"},{"key":"e_1_3_2_1_136_1","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track.","author":"Tirumala Kushal","year":"2023","unstructured":"Kushal Tirumala, Daniel Simig, Armen Aghajanyan, and Ari S Morcos. 2023. D4: Improving LLM Pretraining via Document De-Duplication and Diversification. In Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track."},{"key":"e_1_3_2_1_137_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_138_1","doi-asserted-by":"publisher","DOI":"10.14778\/3551793.3551841"},{"key":"e_1_3_2_1_139_1","doi-asserted-by":"publisher","DOI":"10.1145\/3514221.3517843"},{"key":"e_1_3_2_1_140_1","doi-asserted-by":"publisher","DOI":"10.14778\/3554821.3554896"},{"key":"e_1_3_2_1_141_1","doi-asserted-by":"publisher","DOI":"10.14778\/3579075.3579083"},{"key":"e_1_3_2_1_142_1","unstructured":"Tim Valicenti Justice Vidal and Ritik Patnaik. 2023. Mini-GPTs: Efficient Large Language Models through Contextual Pruning. arxiv: 2312.12682 [cs.CL]"},{"key":"e_1_3_2_1_143_1","volume-title":"Protect Your Prompts: Protocols for IP Protection in LLM Applications. arXiv preprint arXiv:2306.06297","author":"van Wyk MA","year":"2023","unstructured":"MA van Wyk, M Bekker, XL Richards, and KJ Nixon. 2023. 
Protect Your Prompts: Protocols for IP Protection in LLM Applications. arXiv preprint arXiv:2306.06297 (2023)."},{"key":"e_1_3_2_1_144_1","volume-title":"Manoel Horta Ribeiro, and Robert West","author":"Veselovsky Veniamin","year":"2023","unstructured":"Veniamin Veselovsky, Manoel Horta Ribeiro, and Robert West. 2023. Artificial Artificial Artificial Intelligence: Crowd Workers Widely Use Large Language Models for Text Production Tasks. arXiv preprint arXiv:2306.07899 (2023)."},{"key":"e_1_3_2_1_145_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543873.3587595"},{"key":"e_1_3_2_1_146_1","volume-title":"Will we run out of data? An analysis of the limits of scaling datasets in Machine Learning. arXiv preprint arXiv:2211.04325","author":"Villalobos Pablo","year":"2022","unstructured":"Pablo Villalobos, Jaime Sevilla, Lennart Heim, Tamay Besiroglu, Marius Hobbhahn, and Anson Ho. 2022. Will we run out of data? An analysis of the limits of scaling datasets in Machine Learning. arXiv preprint arXiv:2211.04325 (2022)."},{"key":"e_1_3_2_1_147_1","volume-title":"Freshllms: Refreshing large language models with search engine augmentation. arXiv preprint arXiv:2310.03214","author":"Vu Tu","year":"2023","unstructured":"Tu Vu, Mohit Iyyer, Xuezhi Wang, Noah Constant, Jerry Wei, Jason Wei, Chris Tar, Yun-Hsuan Sung, Denny Zhou, Quoc Le, et al. 2023. Freshllms: Refreshing large language models with search engine augmentation. arXiv preprint arXiv:2310.03214 (2023)."},{"key":"e_1_3_2_1_148_1","volume-title":"Explore-Instruct: Enhancing Domain-Specific Instruction Coverage through Active Exploration. arXiv preprint arXiv:2310.09168","author":"Wan Fanqi","year":"2023","unstructured":"Fanqi Wan, Xinting Huang, Tao Yang, Xiaojun Quan, Wei Bi, and Shuming Shi. 2023. Explore-Instruct: Enhancing Domain-Specific Instruction Coverage through Active Exploration. arXiv preprint arXiv:2310.09168 (2023)."},{"key":"e_1_3_2_1_149_1","volume-title":"2023 a. Openchat: Advancing open-source language models with mixed-quality data. arXiv preprint arXiv:2309.11235","author":"Wang Guan","year":"2023","unstructured":"Guan Wang, Sijie Cheng, Xianyuan Zhan, Xiangang Li, Sen Song, and Yang Liu. 2023 a. Openchat: Advancing open-source language models with mixed-quality data. arXiv preprint arXiv:2309.11235 (2023)."},{"key":"e_1_3_2_1_150_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.354"},{"key":"e_1_3_2_1_151_1","volume-title":"David Wadden","author":"Wang Yizhong","year":"2023","unstructured":"Yizhong Wang, Hamish Ivison, Pradeep Dasigi, Jack Hessel, Tushar Khot, Khyathi Raghavi Chandu, David Wadden, Kelsey MacMillan, Noah A Smith, Iz Beltagy, et al. 2023 b. How Far Can Camels Go? Exploring the State of Instruction Tuning on Open Resources. arXiv preprint arXiv:2306.04751 (2023)."},{"key":"e_1_3_2_1_152_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.340"},{"key":"e_1_3_2_1_153_1","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems.","author":"Wei Alexander","year":"2023","unstructured":"Alexander Wei, Nika Haghtalab, and Jacob Steinhardt. 2023. Jailbroken: How Does LLM Safety Training Fail?. 
In Thirty-seventh Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_154_1","volume-title":"Kirsty Anderson, Pushmeet Kohli, Ben Coppin, and Po-Sen Huang.","author":"Welbl Johannes","year":"2021","unstructured":"Johannes Welbl, Amelia Glaese, Jonathan Uesato, Sumanth Dathathri, John Mellor, Lisa Anne Hendricks, Kirsty Anderson, Pushmeet Kohli, Ben Coppin, and Po-Sen Huang. 2021. Challenges in detoxifying language models. arXiv preprint arXiv:2109.07445 (2021)."},{"key":"e_1_3_2_1_155_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415562"},{"key":"e_1_3_2_1_156_1","first-page":"307","article-title":"Attention-based learning for missing data imputation in HoloClean","volume":"2","author":"Wu Richard","year":"2020","unstructured":"Richard Wu, Aoqian Zhang, Ihab Ilyas, and Theodoros Rekatsinas. 2020. Attention-based learning for missing data imputation in HoloClean. Proceedings of Machine Learning and Systems , Vol. 2 (2020), 307--325.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_157_1","volume-title":"2023 a. Bloomberggpt: A large language model for finance. arXiv preprint arXiv:2303.17564","author":"Wu Shijie","year":"2023","unstructured":"Shijie Wu, Ozan Irsoy, Steven Lu, Vadim Dabravolski, Mark Dredze, Sebastian Gehrmann, Prabhanjan Kambadur, David Rosenberg, and Gideon Mann. 2023 a. Bloomberggpt: A large language model for finance. arXiv preprint arXiv:2303.17564 (2023)."},{"key":"e_1_3_2_1_158_1","volume-title":"2023 b. Self-Evolved Diverse Data Sampling for Efficient Instruction Tuning. arXiv preprint arXiv:2311.08182","author":"Wu Shengguang","year":"2023","unstructured":"Shengguang Wu, Keming Lu, Benfeng Xu, Junyang Lin, Qi Su, and Chang Zhou. 2023 b. Self-Evolved Diverse Data Sampling for Efficient Instruction Tuning. arXiv preprint arXiv:2311.08182 (2023)."},{"key":"e_1_3_2_1_159_1","unstructured":"Haojun Xia Zhen Zheng Yuchao Li Donglin Zhuang Zhongzhu Zhou Xiafei Qiu Yong Li Wei Lin and Shuaiwen Leon Song. [n. d.]. Flash-LLM: Enabling Cost-Effective and Highly-Efficient Large Generative Model Inference with Unstructured Sparsity. ( [n. d.])."},{"key":"e_1_3_2_1_160_1","volume-title":"Efficient Streaming Language Models with Attention Sinks. arXiv preprint arXiv:2309.17453","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Yuandong Tian, Beidi Chen, Song Han, and Mike Lewis. 2023. Efficient Streaming Language Models with Attention Sinks. arXiv preprint arXiv:2309.17453 (2023)."},{"key":"e_1_3_2_1_161_1","volume-title":"2023 a. DoReMi: Optimizing Data Mixtures Speeds Up Language Model Pretraining. arXiv preprint arXiv:2305.10429","author":"Xie Sang Michael","year":"2023","unstructured":"Sang Michael Xie, Hieu Pham, Xuanyi Dong, Nan Du, Hanxiao Liu, Yifeng Lu, Percy Liang, Quoc V Le, Tengyu Ma, and Adams Wei Yu. 2023 a. DoReMi: Optimizing Data Mixtures Speeds Up Language Model Pretraining. arXiv preprint arXiv:2305.10429 (2023)."},{"key":"e_1_3_2_1_162_1","volume-title":"2023 b. Data selection for language models via importance resampling. arXiv preprint arXiv:2302.03169","author":"Xie Sang Michael","year":"2023","unstructured":"Sang Michael Xie, Shibani Santurkar, Tengyu Ma, and Percy Liang. 2023 b. Data selection for language models via importance resampling. arXiv preprint arXiv:2302.03169 (2023)."},{"key":"e_1_3_2_1_163_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.190"},{"key":"e_1_3_2_1_164_1","volume-title":"2023 b. Large Language Model Unlearning. 
arXiv preprint arXiv:2310.10683","author":"Yao Yuanshun","year":"2023","unstructured":"Yuanshun Yao, Xiaojun Xu, and Yang Liu. 2023 b. Large Language Model Unlearning. arXiv preprint arXiv:2310.10683 (2023)."},{"key":"e_1_3_2_1_165_1","volume-title":"2023 a. A comprehensive study on post-training quantization for large language models. arXiv preprint arXiv:2303.08302","author":"Yao Zhewei","year":"2023","unstructured":"Zhewei Yao, Cheng Li, Xiaoxia Wu, Stephen Youn, and Yuxiong He. 2023 a. A comprehensive study on post-training quantization for large language models. arXiv preprint arXiv:2303.08302 (2023)."},{"key":"e_1_3_2_1_166_1","volume-title":"Dynosaur: A Dynamic Growth Paradigm for Instruction-Tuning Data Curation. arXiv preprint arXiv:2305.14327","author":"Yin Da","year":"2023","unstructured":"Da Yin, Xiao Liu, Fan Yin, Ming Zhong, Hritik Bansal, Jiawei Han, and Kai-Wei Chang. 2023. Dynosaur: A Dynamic Growth Paradigm for Instruction-Tuning Data Curation. arXiv preprint arXiv:2305.14327 (2023)."},{"key":"e_1_3_2_1_167_1","volume-title":"2023 b. Right to be forgotten in the era of large language models: Implications, challenges, and solutions. arXiv preprint arXiv:2307.03941","author":"Zhang Dawen","year":"2023","unstructured":"Dawen Zhang, Pamela Finckenberg-Broman, Thong Hoang, Shidong Pan, Zhenchang Xing, Mark Staples, and Xiwei Xu. 2023 b. Right to be forgotten in the era of large language models: Implications, challenges, and solutions. arXiv preprint arXiv:2307.03941 (2023)."},{"key":"e_1_3_2_1_168_1","doi-asserted-by":"publisher","DOI":"10.14778\/3407790.3407793"},{"key":"e_1_3_2_1_169_1","volume-title":"2023 a. Instruction tuning for large language models: A survey. arXiv preprint arXiv:2308.10792","author":"Zhang Shengyu","year":"2023","unstructured":"Shengyu Zhang, Linfeng Dong, Xiaoya Li, Sen Zhang, Xiaofei Sun, Shuhe Wang, Jiwei Li, Runyi Hu, Tianwei Zhang, Fei Wu, et al. 2023 a. Instruction tuning for large language models: A survey. arXiv preprint arXiv:2308.10792 (2023)."},{"key":"e_1_3_2_1_170_1","doi-asserted-by":"publisher","DOI":"10.14778\/3467861.3467867"},{"key":"e_1_3_2_1_171_1","volume-title":"2023 c. H2O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models. arXiv preprint arXiv:2306.14048","author":"Zhang Zhenyu","year":"2023","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher R\u00e9, Clark Barrett, et al. 2023 c. H2O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models. arXiv preprint arXiv:2306.14048 (2023)."},{"key":"e_1_3_2_1_172_1","unstructured":"Jiachen Zhao Zhun Deng David Madras James Zou and Mengye Ren. 2023 a. Learning and Forgetting Unsafe Examples in Large Language Models. arxiv: 2312.12736 [cs.CL]"},{"key":"e_1_3_2_1_173_1","volume-title":"2023 b. Automatic calibration and error correction for large language models via pareto optimal self-supervision. arXiv preprint arXiv:2306.16564","author":"Zhao Theodore","year":"2023","unstructured":"Theodore Zhao, Mu Wei, J Samuel Preston, and Hoifung Poon. 2023 b. Automatic calibration and error correction for large language models via pareto optimal self-supervision. arXiv preprint arXiv:2306.16564 (2023)."},{"key":"e_1_3_2_1_174_1","volume-title":"2023 c. Lima: Less is more for alignment. 
arXiv preprint arXiv:2305.11206","author":"Zhou Chunting","year":"2023","unstructured":"Chunting Zhou, Pengfei Liu, Puxin Xu, Srini Iyer, Jiao Sun, Yuning Mao, Xuezhe Ma, Avia Efrat, Ping Yu, Lili Yu, et al. 2023 c. Lima: Less is more for alignment. arXiv preprint arXiv:2305.11206 (2023)."},{"key":"e_1_3_2_1_175_1","volume-title":"2023 b. LoBaSS: Gauging Learnability in Supervised Fine-tuning Data. arXiv preprint arXiv:2310.13008","author":"Zhou Haotian","year":"2023","unstructured":"Haotian Zhou, Tingkai Liu, Qianli Ma, Jianbo Yuan, Pengfei Liu, Yang You, and Hongxia Yang. 2023 b. LoBaSS: Gauging Learnability in Supervised Fine-tuning Data. arXiv preprint arXiv:2310.13008 (2023)."},{"key":"e_1_3_2_1_176_1","volume-title":"2023 a. Oasis: Data Curation and Assessment System for Pretraining of Large Language Models. arXiv preprint arXiv:2311.12537","author":"Zhou Tong","year":"2023","unstructured":"Tong Zhou, Yubo Chen, Pengfei Cao, Kang Liu, Jun Zhao, and Shengping Liu. 2023 a. Oasis: Data Curation and Assessment System for Pretraining of Large Language Models. arXiv preprint arXiv:2311.12537 (2023)."},{"key":"e_1_3_2_1_177_1","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Zhou Zhe","year":"2022","unstructured":"Zhe Zhou, Xuechao Wei, Jiejing Zhang, and Guangyu Sun. 2022. PetS: A Unified Framework for Parameter-Efficient Transformers Serving. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 489--504."}],"event":{"name":"SIGMOD\/PODS '24: International Conference on Management of Data","location":"Santiago, Chile","acronym":"SIGMOD\/PODS '24","sponsor":["SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Companion of the 2024 International Conference on Management of Data"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3626246.3654683","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3626246.3654683","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T11:32:32Z","timestamp":1755862352000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3626246.3654683"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,9]]},"references-count":177,"alternative-id":["10.1145\/3626246.3654683","10.1145\/3626246"],"URL":"https:\/\/doi.org\/10.1145\/3626246.3654683","relation":{},"subject":[],"published":{"date-parts":[[2024,6,9]]},"assertion":[{"value":"2024-06-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}