{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T17:19:45Z","timestamp":1765041585585,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,21]],"date-time":"2024-10-21T00:00:00Z","timestamp":1729468800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102095"],"award-info":[{"award-number":["62102095"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,21]]},"DOI":"10.1145\/3627673.3679835","type":"proceedings-article","created":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T19:34:11Z","timestamp":1729452851000},"page":"2732-2741","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Source Prompt: Coordinated Pre-training of Language Models on Diverse Corpora from Multiple Sources"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-8789-2834","authenticated-orcid":false,"given":"Yipei","family":"Xu","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4937-0805","authenticated-orcid":false,"given":"Dakuan","family":"Lu","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0670-5602","authenticated-orcid":false,"given":"Jiaqing","family":"Liang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9414-2696","authenticated-orcid":false,"given":"Jin","family":"Zhao","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1428-8677","authenticated-orcid":false,"given":"Xintao","family":"Wang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9589-0823","authenticated-orcid":false,"given":"Hengkui","family":"Wu","sequence":"additional","affiliation":[{"name":"Super Symmetry Technology, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8353-2523","authenticated-orcid":false,"given":"Ken","family":"Chen","sequence":"additional","affiliation":[{"name":"Super Symmetry Technology, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7108-2391","authenticated-orcid":false,"given":"Liujiang","family":"Liu","sequence":"additional","affiliation":[{"name":"Super Symmetry Technology, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6460-7458","authenticated-orcid":false,"given":"Yingsi","family":"Xin","sequence":"additional","affiliation":[{"name":"Super Symmetry Technology, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3007-6793","authenticated-orcid":false,"given":"Xuepeng","family":"Liu","sequence":"additional","affiliation":[{"name":"Super Symmetry Technology, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8403-9591","authenticated-orcid":false,"given":"Yanghua","family":"Xiao","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2355-288X","authenticated-orcid":false,"given":"Zhixu","family":"Li","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,21]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.692"},{"key":"e_1_3_2_1_2_1","unstructured":"Rohan Anil Andrew M Dai Orhan Firat Melvin Johnson Dmitry Lepikhin Alexandre Passos Siamak Shakeri Emanuel Taropa Paige Bailey Zhifeng Chen et al. 2023. Palm 2 technical report. arXiv preprint arXiv:2305.10403 (2023)."},{"key":"e_1_3_2_1_3_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_4_1","volume-title":"Decision transformer: Reinforcement learning via sequence modeling. Advances in neural information processing systems","author":"Chen Lili","year":"2021","unstructured":"Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Misha Laskin, Pieter Abbeel, Aravind Srinivas, and Igor Mordatch. 2021. Decision transformer: Reinforcement learning via sequence modeling. Advances in neural information processing systems, Vol. 34 (2021), 15084--15097."},{"key":"e_1_3_2_1_5_1","unstructured":"Hyung Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Yunxuan Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma Albert Webson Shixiang Shane Gu Zhuyun Dai Mirac Suzgun Xinyun Chen Aakanksha Chowdhery Alex Castro-Ros Marie Pellat Kevin Robinson Dasha Valter Sharan Narang Gaurav Mishra Adams Yu Vincent Zhao Yanping Huang Andrew Dai Hongkun Yu Slav Petrov Ed H. Chi Jeff Dean Jacob Devlin Adam Roberts Denny Zhou Quoc V. Le and Jason Wei. 2022. Scaling Instruction-Finetuned Language Models. arxiv: 2210.11416 [cs.LG]"},{"key":"e_1_3_2_1_6_1","unstructured":"Together Computer. 2023. RedPajama-Data: An Open Source Recipe to Reproduce LLaMA training dataset. https:\/\/github.com\/togethercomputer\/RedPajama-Data"},{"key":"e_1_3_2_1_7_1","unstructured":"Tri Dao. 2023. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Benjamin Van Durme, and Jo ao Sedoc","author":"Feng Yukun","year":"2022","unstructured":"Yukun Feng, Patrick Xia, Benjamin Van Durme, and Jo ao Sedoc. 2022. Automatic Document Selection for Efficient Encoder Pretraining. arXiv preprint arXiv:2210.10951 (2022)."},{"key":"e_1_3_2_1_9_1","volume-title":"Controlling linguistic style aspects in neural language generation. arXiv preprint arXiv:1707.02633","author":"Ficler Jessica","year":"2017","unstructured":"Jessica Ficler and Yoav Goldberg. 2017. Controlling linguistic style aspects in neural language generation. arXiv preprint arXiv:1707.02633 (2017)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11023-020-09548-1"},{"key":"e_1_3_2_1_11_1","unstructured":"Leo Gao Stella Biderman Sid Black Laurence Golding Travis Hoppe Charles Foster Jason Phang Horace He Anish Thite Noa Nabeshima et al. 2020. The pile: An 800gb dataset of diverse text for language modeling. arXiv preprint arXiv:2101.00027 (2020)."},{"key":"e_1_3_2_1_12_1","unstructured":"Xinyang Geng and Hao Liu. 2023. OpenLLaMA: An Open Reproduction of LLaMA. https:\/\/github.com\/openlm-research\/open_llama"},{"key":"e_1_3_2_1_13_1","volume-title":"Maximizing stylistic control and semantic accuracy in nlg: Personality variation and discourse contrast. arXiv preprint arXiv:1907.09527","author":"Harrison Vrindavan","year":"2019","unstructured":"Vrindavan Harrison, Lena Reed, Shereen Oraby, and Marilyn Walker. 2019. Maximizing stylistic control and semantic accuracy in nlg: Personality variation and discourse contrast. arXiv preprint arXiv:1907.09527 (2019)."},{"key":"e_1_3_2_1_14_1","volume-title":"Nataraj Jammalamadaka, Jianyu Huang, Hector Yuen, et al.","author":"Kalamkar Dhiraj","year":"2019","unstructured":"Dhiraj Kalamkar, Dheevatsa Mudigere, Naveen Mellempudi, Dipankar Das, Kunal Banerjee, Sasikanth Avancha, Dharma Teja Vooturi, Nataraj Jammalamadaka, Jianyu Huang, Hector Yuen, et al. 2019. A study of BFLOAT16 for deep learning training. arXiv preprint arXiv:1905.12322 (2019)."},{"key":"e_1_3_2_1_15_1","volume-title":"Ctrl: A conditional transformer language model for controllable generation. arXiv preprint arXiv:1909.05858","author":"Keskar Nitish Shirish","year":"2019","unstructured":"Nitish Shirish Keskar, Bryan McCann, Lav R Varshney, Caiming Xiong, and Richard Socher. 2019. Ctrl: A conditional transformer language model for controllable generation. arXiv preprint arXiv:1909.05858 (2019)."},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Machine Learning. PMLR, 5464--5474","author":"Killamsetty Krishnateja","year":"2021","unstructured":"Krishnateja Killamsetty, Sivasubramanian Durga, Ganesh Ramakrishnan, Abir De, and Rishabh Iyer. 2021. Grad-match: Gradient matching based data subset selection for efficient deep model training. In International Conference on Machine Learning. PMLR, 5464--5474."},{"key":"e_1_3_2_1_17_1","volume-title":"International Conference on Machine Learning. PMLR, 17506--17533","author":"Korbak Tomasz","year":"2023","unstructured":"Tomasz Korbak, Kejian Shi, Angelica Chen, Rasika Vinayak Bhalerao, Christopher Buckley, Jason Phang, Samuel R Bowman, and Ethan Perez. 2023. Pretraining language models with human preferences. In International Conference on Machine Learning. PMLR, 17506--17533."},{"key":"e_1_3_2_1_18_1","volume-title":"Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, & Toxicity. arXiv preprint arXiv:2305.13169","author":"Longpre Shayne","year":"2023","unstructured":"Shayne Longpre, Gregory Yauney, Emily Reif, Katherine Lee, Adam Roberts, Barret Zoph, Denny Zhou, Jason Wei, Kevin Robinson, David Mimno, et al. 2023. A Pretrainer's Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, & Toxicity. arXiv preprint arXiv:2305.13169 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"Quark: Controllable text generation with reinforced unlearning. Advances in neural information processing systems","author":"Lu Ximing","year":"2022","unstructured":"Ximing Lu, Sean Welleck, Jack Hessel, Liwei Jiang, Lianhui Qin, Peter West, Prithviraj Ammanabrolu, and Yejin Choi. 2022. Quark: Controllable text generation with reinforced unlearning. Advances in neural information processing systems, Vol. 35 (2022), 27591--27609."},{"key":"e_1_3_2_1_20_1","volume-title":"International Conference on Machine Learning. PMLR, 15630--15649","author":"Mindermann S\u00f6ren","year":"2022","unstructured":"S\u00f6ren Mindermann, Jan M Brauner, Muhammed T Razzak, Mrinank Sharma, Andreas Kirsch, Winnie Xu, Benedikt H\u00f6ltgen, Aidan N Gomez, Adrien Morisot, Sebastian Farquhar, et al. 2022. Prioritized training on points that are learnable, worth learning, and not yet learnt. In International Conference on Machine Learning. PMLR, 15630--15649."},{"key":"e_1_3_2_1_21_1","first-page":"20596","article-title":"Deep learning on a data diet: Finding important examples early in training","volume":"34","author":"Paul Mansheej","year":"2021","unstructured":"Mansheej Paul, Surya Ganguli, and Gintare Karolina Dziugaite. 2021. Deep learning on a data diet: Finding important examples early in training. Advances in Neural Information Processing Systems, Vol. 34 (2021), 20596--20607.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_22_1","volume-title":"The RefinedWeb dataset for Falcon LLM: outperforming curated corpora with web data, and web data only. arXiv preprint arXiv:2306.01116","author":"Penedo Guilherme","year":"2023","unstructured":"Guilherme Penedo, Quentin Malartic, Daniel Hesslow, Ruxandra Cojocaru, Alessandro Cappelli, Hamza Alobeidli, Baptiste Pannier, Ebtesam Almazrouei, and Julien Launay. 2023. The RefinedWeb dataset for Falcon LLM: outperforming curated corpora with web data, and web data only. arXiv preprint arXiv:2306.01116 (2023)."},{"key":"e_1_3_2_1_23_1","unstructured":"Xue Bin Peng Aviral Kumar Grace Zhang and Sergey Levine. 2020. Advantage Weighted Regression: Simple and Scalable Off-Policy Reinforcement Learning. https:\/\/openreview.net\/forum?id=H1gdF34FvS"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/1273496.1273590"},{"key":"e_1_3_2_1_25_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. https:\/\/api.semanticscholar.org\/CorpusID:160025533"},{"key":"e_1_3_2_1_26_1","volume-title":"Liu","author":"Raffel Colin","year":"2019","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2019. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. arXiv e-prints (2019). arxiv: 1910.10683"},{"key":"e_1_3_2_1_27_1","volume-title":"Liu","author":"Raffel Colin","year":"2023","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2023. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. arxiv: 1910.10683 [cs.LG]"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-6323"},{"key":"e_1_3_2_1_30_1","first-page":"19523","article-title":"Beyond neural scaling laws: beating power law scaling via data pruning","volume":"35","author":"Sorscher Ben","year":"2022","unstructured":"Ben Sorscher, Robert Geirhos, Shashank Shekhar, Surya Ganguli, and Ari Morcos. 2022. Beyond neural scaling laws: beating power law scaling via data pruning. Advances in Neural Information Processing Systems, Vol. 35 (2022), 19523--19536.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_31_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-6314"},{"key":"e_1_3_2_1_33_1","volume-title":"Nghi D. Q. Bui, Junnan Li, and Steven C. H. Hoi.","author":"Wang Yue","year":"2023","unstructured":"Yue Wang, Hung Le, Akhilesh Deepak Gotmare, Nghi D. Q. Bui, Junnan Li, and Steven C. H. Hoi. 2023. CodeT5: Open Code Large Language Models for Code Understanding and Generation. arxiv: 2305.07922 [cs.CL]"},{"key":"e_1_3_2_1_34_1","unstructured":"Shaohua Wu Xudong Zhao Tong Yu Rongguo Zhang Chong Shen Hongli Liu Feng Li Hong Zhu Jiangang Luo Liang Xu et al. 2021. Yuan 1.0: Large-scale pre-trained language model in zero-shot and few-shot learning. arXiv preprint arXiv:2110.04725 (2021)."},{"key":"e_1_3_2_1_35_1","volume-title":"Data selection for language models via importance resampling. arXiv preprint arXiv:2302.03169","author":"Xie Sang Michael","year":"2023","unstructured":"Sang Michael Xie, Shibani Santurkar, Tengyu Ma, and Percy Liang. 2023. Data selection for language models via importance resampling. arXiv preprint arXiv:2302.03169 (2023)."},{"key":"e_1_3_2_1_36_1","volume-title":"CLUE: A Chinese language understanding evaluation benchmark. arXiv preprint arXiv:2004.05986","author":"Xu Liang","year":"2020","unstructured":"Liang Xu, Hai Hu, Xuanwei Zhang, Lu Li, Chenjie Cao, Yudong Li, Yechen Xu, Kai Sun, Dian Yu, Cong Yu, et al. 2020. CLUE: A Chinese language understanding evaluation benchmark. arXiv preprint arXiv:2004.05986 (2020)."},{"key":"e_1_3_2_1_37_1","volume-title":"CLUECorpus2020: A Large-scale Chinese Corpus for Pre-training Language Model. ArXiv","author":"Xu Liang","year":"2020","unstructured":"Liang Xu, Xuanwei Zhang, and Qianqian Dong. 2020. CLUECorpus2020: A Large-scale Chinese Corpus for Pre-training Language Model. ArXiv, Vol. abs\/2003.01355 (2020)."},{"key":"e_1_3_2_1_38_1","volume-title":"mT5: A massively multilingual pre-trained text-to-text transformer. arxiv","author":"Xue Linting","year":"2010","unstructured":"Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, and Colin Raffel. 2021. mT5: A massively multilingual pre-trained text-to-text transformer. arxiv: 2010.11934 [cs.CL]"},{"key":"e_1_3_2_1_39_1","unstructured":"Aiyuan Yang Bin Xiao Bingning Wang Borong Zhang Ce Bian Chao Yin Chenxu Lv Da Pan Dian Wang Dong Yan et al. 2023. Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Machine Learning. PMLR, 25438--25451","author":"Yao Xingcheng","year":"2022","unstructured":"Xingcheng Yao, Yanan Zheng, Xiaocong Yang, and Zhilin Yang. 2022. Nlp from scratch without large-scale pretraining: A simple and efficient framework. In International Conference on Machine Learning. PMLR, 25438--25451."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2021.06.001"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2021.12.003"},{"key":"e_1_3_2_1_43_1","first-page":"241","article-title":"UER: An Open-Source Toolkit for Pre-training Models","volume":"2019","author":"Zhao Zhe","year":"2019","unstructured":"Zhe Zhao, Hui Chen, Jinbin Zhang, Xin Zhao, Tao Liu, Wei Lu, Xi Chen, Haotang Deng, Qi Ju, and Xiaoyong Du. 2019. UER: An Open-Source Toolkit for Pre-training Models. EMNLP-IJCNLP 2019 (2019), 241.","journal-title":"EMNLP-IJCNLP"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.11"}],"event":{"name":"CIKM '24: The 33rd ACM International Conference on Information and Knowledge Management","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Boise ID USA","acronym":"CIKM '24"},"container-title":["Proceedings of the 33rd ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627673.3679835","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3627673.3679835","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:58:07Z","timestamp":1750294687000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627673.3679835"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,21]]},"references-count":44,"alternative-id":["10.1145\/3627673.3679835","10.1145\/3627673"],"URL":"https:\/\/doi.org\/10.1145\/3627673.3679835","relation":{},"subject":[],"published":{"date-parts":[[2024,10,21]]},"assertion":[{"value":"2024-10-21","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}