{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T14:24:27Z","timestamp":1762957467367,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":96,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,24]],"date-time":"2024-08-24T00:00:00Z","timestamp":1724457600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the Fundamental Research Funds for the Central Universities"},{"name":"the Zhejiang Province ?Leading Talent of Technological Innovation Program?","award":["No. 2023R5214"],"award-info":[{"award-number":["No. 2023R5214"]}]},{"name":"the Pioneer R&D Program of Zhejiang","award":["No.2024C01021"],"award-info":[{"award-number":["No.2024C01021"]}]},{"name":"the Major Research Program of Zhejiang Provincial Natural Science Foundation","award":["No.~LD24F020015"],"award-info":[{"award-number":["No.~LD24F020015"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,25]]},"DOI":"10.1145\/3637528.3671963","type":"proceedings-article","created":{"date-parts":[[2024,8,25]],"date-time":"2024-08-25T04:55:12Z","timestamp":1724561712000},"page":"4035-4046","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["BoKA: Bayesian Optimization based Knowledge Amalgamation for Multi-unknown-domain Text Classification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1063-3990","authenticated-orcid":false,"given":"Linzhu","family":"Yu","sequence":"first","affiliation":[{"name":"The State Key Laboratory of Blockchain and Data Security, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0084-1662","authenticated-orcid":false,"given":"Huan","family":"Li","sequence":"additional","affiliation":[{"name":"The State Key Laboratory of Blockchain and Data Security, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3062-0900","authenticated-orcid":false,"given":"Ke","family":"Chen","sequence":"additional","affiliation":[{"name":"The State Key Laboratory of Blockchain and Data Security, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8062-8356","authenticated-orcid":false,"given":"Lidan","family":"Shou","sequence":"additional","affiliation":[{"name":"The State Key Laboratory of Blockchain and Data Security, Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,8,24]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Himan Abdollahpouri Edward C Malthouse Joseph A Konstan Bamshad Mobasher and Jeremy Gilbert. 2021. Toward the next generation of news recommender systems. In WWW Companion. 402--406.","DOI":"10.1145\/3442442.3452327"},{"key":"e_1_3_2_1_2_1","unstructured":"Michael Carbin Abhinav Venigalla Jonathan Frankle. 2022. BioMedLM: a Domain-Specific Large Language Model for Biomedical Text. https:\/\/www.mosaicml.com\/blog\/introducing-pubmed-gpt"},{"key":"e_1_3_2_1_3_1","volume-title":"Publicly available clinical BERT embeddings. arXiv preprint arXiv:1904.03323","author":"Alsentzer Emily","year":"2019","unstructured":"Emily Alsentzer, John R Murphy, Willie Boag, Wei-Hung Weng, Di Jin, Tristan Naumann, and Matthew McDermott. 2019. 
Publicly available clinical BERT embeddings. arXiv preprint arXiv:1904.03323 (2019)."},{"key":"e_1_3_2_1_4_1","volume-title":"Finbert: Financial sentiment analysis with pre-trained language models. arXiv preprint arXiv:1908.10063","author":"Araci Dogu","year":"2019","unstructured":"Dogu Araci. 2019. Finbert: Financial sentiment analysis with pre-trained language models. arXiv preprint arXiv:1908.10063 (2019)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Yusuf Arslan Kevin Allix Lisa Veiber Cedric Lothritz Tegawend\u00e9 F Bissyand\u00e9 Jacques Klein and Anne Goujon. 2021. A comparison of pre-trained language models for multi-class text classification in the financial domain. In WWW Companion. 260--268.","DOI":"10.1145\/3442442.3451375"},{"key":"e_1_3_2_1_6_1","volume-title":"SciBERT: A pretrained language model for scientific text. arXiv preprint arXiv:1903.10676","author":"Beltagy Iz","year":"2019","unstructured":"Iz Beltagy, Kyle Lo, and Arman Cohan. 2019. SciBERT: A pretrained language model for scientific text. arXiv preprint arXiv:1903.10676 (2019)."},{"key":"e_1_3_2_1_7_1","first-page":"2546","article-title":"Algorithms for hyper-parameter optimization","volume":"24","author":"Bergstra James","year":"2011","unstructured":"James Bergstra, R\u00e9mi Bardenet, Yoshua Bengio, and Bal\u00e1zs K\u00e9gl. 2011. Algorithms for hyper-parameter optimization. NeurIPS, Vol. 24 (2011), 2546--2554.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_8_1","volume-title":"A tutorial on Bayesian optimization of expensive cost functions, with application to active user modeling and hierarchical reinforcement learning. arXiv preprint arXiv:1012.2599","author":"Brochu Eric","year":"2010","unstructured":"Eric Brochu, Vlad M Cora, and Nando De Freitas. 2010. A tutorial on Bayesian optimization of expensive cost functions, with application to active user modeling and hierarchical reinforcement learning. arXiv preprint arXiv:1012.2599 (2010)."},{"key":"e_1_3_2_1_9_1","volume-title":"A tutorial on Bayesian optimization of expensive cost functions, with application to active user modeling and hierarchical reinforcement learning. arXiv preprint arXiv:1012.2599","author":"Brochu Eric","year":"2010","unstructured":"Eric Brochu, Vlad M Cora, and Nando De Freitas. 2010. A tutorial on Bayesian optimization of expensive cost functions, with application to active user modeling and hierarchical reinforcement learning. arXiv preprint arXiv:1012.2599 (2010)."},{"key":"e_1_3_2_1_10_1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. 2020. Language models are few-shot learners. NeurIPS, Vol. 33 (2020), 1877--1901.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Yevgen Chebotar and Austin Waters. 2016. Distilling knowledge from ensembles of neural networks for speech recognition. In Interspeech. 3439--3443.","DOI":"10.21437\/Interspeech.2016-1190"},{"key":"e_1_3_2_1_12_1","volume-title":"Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al.","author":"Chen Mark","year":"2021","unstructured":"Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. 2021. 
Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374 (2021)."},{"key":"e_1_3_2_1_13_1","volume-title":"Angelika Romanou, Antoine Bonnet","author":"Chen Zeming","year":"2023","unstructured":"Zeming Chen, Alejandro Hern\u00e1ndez Cano, Angelika Romanou, Antoine Bonnet, Kyle Matoba, Francesco Salvi, Matteo Pagliardini, Simin Fan, Andreas K\u00f6pf, Amirkeivan Mohtashami, et al. 2023. Meditron-70b: Scaling medical pretraining for large language models. arXiv preprint arXiv:2311.16079 (2023)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Kevin Clark Minh-Thang Luong Urvashi Khandelwal Christopher D Manning and Quoc Le. 2019. BAM! Born-Again Multi-Task Networks for Natural Language Understanding. In ACL. 5931--5937.","DOI":"10.18653\/v1\/P19-1595"},{"key":"e_1_3_2_1_15_1","volume-title":"BERT: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_16_1","volume-title":"Bayesian optimization for machine learning: A practical guidebook. arXiv preprint arXiv:1612.04858","author":"Dewancker Ian","year":"2016","unstructured":"Ian Dewancker, Michael McCourt, and Scott Clark. 2016. Bayesian optimization for machine learning: A practical guidebook. arXiv preprint arXiv:1612.04858 (2016)."},{"key":"e_1_3_2_1_17_1","first-page":"12345","article-title":"Agree to Disagree: Adaptive Ensemble Knowledge Distillation in Gradient Space","volume":"33","author":"Du Shangchen","year":"2020","unstructured":"Shangchen Du, Shan You, Xiaojie Li, Jianlong Wu, Fei Wang, Chen Qian, and Changshui Zhang. 2020. Agree to Disagree: Adaptive Ensemble Knowledge Distillation in Gradient Space. In NeurIPS, Vol. 33. 12345--12355.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_18_1","volume-title":"BOHB: Robust and efficient hyperparameter optimization at scale. In ICML. 1437--1446.","author":"Falkner Stefan","year":"2018","unstructured":"Stefan Falkner, Aaron Klein, and Frank Hutter. 2018. BOHB: Robust and efficient hyperparameter optimization at scale. In ICML. 1437--1446."},{"key":"e_1_3_2_1_19_1","first-page":"8","article-title":"A Tutorial on Bayesian Optimization","volume":"1050","author":"Frazier Peter I","year":"2018","unstructured":"Peter I Frazier. 2018. A Tutorial on Bayesian Optimization. STAT, Vol. 1050 (2018), 8.","journal-title":"STAT"},{"key":"e_1_3_2_1_20_1","volume-title":"A Visual Exploration of Gaussian Processes. Distill","author":"G\u00f6rtler Jochen","year":"2019","unstructured":"Jochen G\u00f6rtler, Rebecca Kehlbeck, and Oliver Deussen. 2019. A Visual Exploration of Gaussian Processes. Distill (2019)."},{"key":"e_1_3_2_1_21_1","volume-title":"MedAlpaca--An Open-Source Collection of Medical Conversational AI Models and Training Data. arXiv preprint arXiv:2304.08247","author":"Han Tianyu","year":"2023","unstructured":"Tianyu Han, Lisa C Adams, Jens-Michalis Papaioannou, Paul Grundmann, Tom Oberhauser, Alexander L\u00f6ser, Daniel Truhn, and Keno K Bressem. 2023. MedAlpaca--An Open-Source Collection of Medical Conversational AI Models and Training Data. arXiv preprint arXiv:2304.08247 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Distilling the knowledge in a neural network. 
arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_1_23_1","volume-title":"Teacher-Student Architecture for Knowledge Learning: A Survey. arXiv preprint arXiv:2210.17332","author":"Hu Chengming","year":"2022","unstructured":"Chengming Hu, Xuan Li, Dan Liu, Xi Chen, Ju Wang, and Xue Liu. 2022. Teacher-Student Architecture for Knowledge Learning: A Survey. arXiv preprint arXiv:2210.17332 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"Clinicalbert: Modeling clinical notes and predicting hospital readmission. arXiv preprint arXiv:1904.05342","author":"Huang Kexin","year":"2019","unstructured":"Kexin Huang, Jaan Altosaar, and Rajesh Ranganath. 2019. Clinicalbert: Modeling clinical notes and predicting hospital readmission. arXiv preprint arXiv:1904.05342 (2019)."},{"key":"e_1_3_2_1_25_1","volume-title":"AILA: A Question Answering System in the Legal Domain. In IJCAI. 5258--5260.","author":"Huang Weiyi","year":"2020","unstructured":"Weiyi Huang, Jiahao Jiang, Qiang Qu, and Min Yang. 2020. AILA: A Question Answering System in the Legal Domain. In IJCAI. 5258--5260."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Qiao Jin Bhuwan Dhingra Zhengping Liu William Cohen and Xinghua Lu. 2019. PubMedQA: A Dataset for Biomedical Research Question Answering. In EMNLP-IJCNLP. 2567--2577.","DOI":"10.18653\/v1\/D19-1259"},{"key":"e_1_3_2_1_27_1","volume-title":"Amalgamating Knowledge from Heterogeneous Graph Neural Networks. CVPR","author":"Jing Yongcheng","year":"2021","unstructured":"Yongcheng Jing, Yiding Yang, Xinchao Wang, Mingli Song, and Dacheng Tao. 2021. Amalgamating Knowledge from Heterogeneous Graph Neural Networks. CVPR (2021), 15704--15713."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1008306431147"},{"key":"e_1_3_2_1_29_1","volume-title":"Palmyra-med: Instruction-based fine-tuning of llms enhancing medical domain performance.","author":"Kamble Kiran","year":"2023","unstructured":"Kiran Kamble and Waseem Alshikh. 2023. Palmyra-med: Instruction-based fine-tuning of llms enhancing medical domain performance. (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"NeurIPS","volume":"31","author":"Kandasamy Kirthevasan","year":"2018","unstructured":"Kirthevasan Kandasamy, Willie Neiswanger, Jeff Schneider, Barnabas Poczos, and Eric P Xing. 2018. Neural architecture search with bayesian optimisation and optimal transport. NeurIPS, Vol. 31 (2018)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1111\/1467-9868.00294"},{"key":"e_1_3_2_1_32_1","volume-title":"Matthew S Gerber, and Laura E Barnes.","author":"Kowsari Kamran","year":"2017","unstructured":"Kamran Kowsari, Donald E Brown, Mojtaba Heidarysafa, Kiana Jafari Meimandi, Matthew S Gerber, and Laura E Barnes. 2017. Hdltex: Hierarchical deep learning for text classification. In ICMLA. IEEE, 364--371."},{"key":"e_1_3_2_1_33_1","volume-title":"NeurIPS","volume":"25","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. NeurIPS, Vol. 25 (2012)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Kisoo Kwon Hwidong Na Hoshik Lee and Nam Soo Kim. 2020. Adaptive Knowledge Distillation Based on Entropy. In ICASSP. 
7409--7413.","DOI":"10.1109\/ICASSP40776.2020.9054698"},{"key":"e_1_3_2_1_35_1","volume-title":"Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942","author":"Lan Zhenzhong","year":"2019","unstructured":"Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut. 2019. Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942 (2019)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btz682"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Zhuoran Li Chunming Hu Xiaohui Guo Junfan Chen Wenyi Qin and Richong Zhang. 2022. An unsupervised multiple-task and multiple-teacher model for cross-lingual named entity recognition. In ACL. 170--179.","DOI":"10.18653\/v1\/2022.acl-long.14"},{"key":"e_1_3_2_1_38_1","volume-title":"Christoffer Egeberg Hother, and Ole Winther","author":"Li\u00e9vin Valentin","year":"2022","unstructured":"Valentin Li\u00e9vin, Christoffer Egeberg Hother, and Ole Winther. 2022. Can large language models reason about medical questions? arXiv preprint arXiv:2207.08143 (2022)."},{"key":"e_1_3_2_1_39_1","volume-title":"Ida Riis Jensen, and Ole Winther.","author":"Li\u00e9vin Valentin","year":"2023","unstructured":"Valentin Li\u00e9vin, Andreas Geert Motzfeldt, Ida Riis Jensen, and Ole Winther. 2023. Variational Open-Domain Question Answering. In ICML. 20950--20977."},{"key":"e_1_3_2_1_40_1","first-page":"2351","article-title":"Ensemble Distillation for Robust Model Fusion in Federated Learning","volume":"33","author":"Lin Tao","year":"2020","unstructured":"Tao Lin, Lingjing Kong, Sebastian U Stich, and Martin Jaggi. 2020. Ensemble Distillation for Robust Model Fusion in Federated Learning. In NeurIPS, Vol. 33. 2351--2363.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_41_1","volume-title":"Prod: Progressive distillation for dense retrieval. In WWW. 3299--3308.","author":"Lin Zhenghao","year":"2023","unstructured":"Zhenghao Lin, Yeyun Gong, Xiao Liu, Hang Zhang, Chen Lin, Anlei Dong, Jian Jiao, Jingwen Lu, Daxin Jiang, Rangan Majumder, et al. 2023. Prod: Progressive distillation for dense retrieval. In WWW. 3299--3308."},{"key":"e_1_3_2_1_42_1","volume-title":"Improving multi-task deep neural networks via knowledge distillation for natural language understanding. arXiv preprint arXiv:1904.09482","author":"Liu Xiaodong","year":"2019","unstructured":"Xiaodong Liu, Pengcheng He, Weizhu Chen, and Jianfeng Gao. 2019. Improving multi-task deep neural networks via knowledge distillation for natural language understanding. arXiv preprint arXiv:1904.09482 (2019)."},{"key":"e_1_3_2_1_43_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_44_1","unstructured":"Daniel James Lizotte. 2008. Practical bayesian optimization. (2008)."},{"key":"e_1_3_2_1_45_1","volume-title":"Hqadeephelper: A deep learning system for healthcare question answering. In WWW Companion. 194--197.","author":"Luo Feng","year":"2020","unstructured":"Feng Luo, Xiaoli Wang, Qingfeng Wu, Jiaying Liang, Xueliang Qiu, and Zhifeng Bao. 2020. 
Hqadeephelper: A deep learning system for healthcare question answering. In WWW Companion. 194--197."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1093\/bib\/bbac409"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Sihui Luo Wenwen Pan Xinchao Wang Dazhou Wang Haihong Tang and Mingli Song. 2020. Collaboration by competition: Self-coordinated knowledge amalgamation for multi-talent student learning. In ECCV. 631--646.","DOI":"10.1007\/978-3-030-58539-6_38"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Sihui Luo Xinchao Wang Gongfan Fang Yao Hu Dapeng Tao and Mingli Song. 2019. Knowledge amalgamation from heterogeneous networks by common feature learning. In AAAI. 3087--3093.","DOI":"10.24963\/ijcai.2019\/428"},{"key":"e_1_3_2_1_49_1","volume-title":"Biomedgpt: Open multimodal generative pre-trained transformer for biomedicine. arXiv preprint arXiv:2308.09442","author":"Luo Yizhen","year":"2023","unstructured":"Yizhen Luo, Jiahuan Zhang, Siqi Fan, Kai Yang, Yushuai Wu, Mu Qiao, and Zaiqing Nie. 2023. Biomedgpt: Open multimodal generative pre-trained transformer for biomedicine. arXiv preprint arXiv:2308.09442 (2023)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Fatemehsadat Mireshghallah Archit Uniyal Tianhao Wang David K Evans and Taylor Berg-Kirkpatrick. 2022. An empirical analysis of memorization in fine-tuned autoregressive language models. In EMNLP. 1816--1826.","DOI":"10.18653\/v1\/2022.emnlp-main.119"},{"key":"e_1_3_2_1_51_1","first-page":"9361","article-title":"Bayesian optimization for iterative learning","volume":"33","author":"Nguyen Vu","year":"2020","unstructured":"Vu Nguyen, Sebastian Schulze, and Michael Osborne. 2020. Bayesian optimization for iterative learning. NeurIPS, Vol. 33 (2020), 9361--9371.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_52_1","unstructured":"The National Library of Medicine. 2024. MeSH: Medical Subject Headings. https:\/\/www.nlm.nih.gov\/mesh\/meshhome.html"},{"key":"e_1_3_2_1_53_1","unstructured":"OpenAI. 2022. ChatGPT. https:\/\/openai.com\/blog\/chatgpt"},{"key":"e_1_3_2_1_54_1","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. 2022. Training language models to follow instructions with human feedback. NeurIPS, Vol. 35 (2022), 27730--27744.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_55_1","volume-title":"Logesh Kumar Umapathi, and Malaikannan Sankarasubbu","author":"Pal Ankit","year":"2022","unstructured":"Ankit Pal, Logesh Kumar Umapathi, and Malaikannan Sankarasubbu. 2022. Medmcqa: A large-scale multi-subject multi-choice dataset for medical domain question answering. In CHIL. 248--260."},{"key":"e_1_3_2_1_56_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et al. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_2_1_57_1","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, et al. 2019. Language models are unsupervised multitask learners. OpenAI Blog, Vol. 
1, 8 (2019), 9.","journal-title":"OpenAI Blog"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_1_59_1","volume-title":"Chen","author":"Ravaut Mathieu","year":"2022","unstructured":"Mathieu Ravaut, Shafiq R. Joty, and Nancy F. Chen. 2022. SummaReranker: A Multi-Task Mixture-of-Experts Re-ranking Framework for Abstractive Summarization. In ACL."},{"key":"e_1_3_2_1_60_1","first-page":"1814","article-title":"The PII problem: Privacy and a new concept of personally identifiable information","volume":"86","author":"Schwartz Paul M","year":"2011","unstructured":"Paul M Schwartz and Daniel J Solove. 2011. The PII problem: Privacy and a new concept of personally identifiable information. NYUL Rev., Vol. 86 (2011), 1814.","journal-title":"NYUL Rev."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2015.2494218"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Chengchao Shen Xinchao Wang Jie Song Li Sun and Mingli Song. 2019. Amalgamating Knowledge towards Comprehensive Classification. In AAAI. 3068--3075.","DOI":"10.1609\/aaai.v33i01.33013068"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Chengchao Shen Mengqi Xue Xinchao Wang Jie Song Li Sun and Mingli Song. 2019. Customizing student networks from heterogeneous teachers via adaptive knowledge amalgamation. In ICCV. 3504--3513.","DOI":"10.1109\/ICCV.2019.00360"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"crossref","unstructured":"Chengchao Shen Mengqi Xue Xinchao Wang Jie Song Li Sun and Mingli Song. 2019. Customizing Student Networks From Heterogeneous Teachers via Adaptive Knowledge Amalgamation. In ICCV. 3503--3512.","DOI":"10.1109\/ICCV.2019.00360"},{"key":"e_1_3_2_1_65_1","volume-title":"Nathan Scales, Ajay Tanwani, Heather Cole-Lewis, Stephen Pfohl, et al.","author":"Singhal Karan","year":"2023","unstructured":"Karan Singhal, Shekoofeh Azizi, Tao Tu, S Sara Mahdavi, Jason Wei, Hyung Won Chung, Nathan Scales, Ajay Tanwani, Heather Cole-Lewis, Stephen Pfohl, et al. 2023. Large language models encode clinical knowledge. Nature, Vol. 620, 7972 (2023), 172--180."},{"key":"e_1_3_2_1_66_1","unstructured":"Karan Singhal Tao Tu Juraj Gottweis Rory Sayres Ellery Wulczyn Le Hou Kevin Clark Stephen Pfohl Heather Cole-Lewis Darlene Neal et al. 2023. Towards expert-level medical question answering with large language models. arXiv preprint arXiv:2305.09617 (2023)."},{"key":"e_1_3_2_1_67_1","volume-title":"Adams","author":"Snoek Jasper","year":"2012","unstructured":"Jasper Snoek, Hugo Larochelle, and Ryan P. Adams. 2012. Practical Bayesian Optimization of Machine Learning Algorithms. In NeurIPS. 2960--2968."},{"key":"e_1_3_2_1_68_1","volume-title":"Feded: Federated learning via ensemble distillation for medical relation extraction. In EMNLP. 2118--2128.","author":"Sui Dianbo","year":"2020","unstructured":"Dianbo Sui, Yubo Chen, Jun Zhao, Yantao Jia, Yuantao Xie, and Weijian Sun. 2020. Feded: Federated learning via ensemble distillation for medical relation extraction. In EMNLP. 2118--2128."},{"volume-title":"How to fine-tune bert for text classification?","author":"Sun Chi","key":"e_1_3_2_1_69_1","unstructured":"Chi Sun, Xipeng Qiu, Yige Xu, and Xuanjing Huang. 2019. How to fine-tune bert for text classification?. In CCL. Springer, 194--206."},{"key":"e_1_3_2_1_70_1","volume-title":"NeurIPS","volume":"26","author":"Swersky Kevin","year":"2013","unstructured":"Kevin Swersky, Jasper Snoek, and Ryan P Adams. 2013. 
Multi-task bayesian optimization. NeurIPS, Vol. 26 (2013)."},{"key":"e_1_3_2_1_71_1","volume-title":"Alpaca: A strong, replicable instruction-following model., 7 pages.","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B Hashimoto. 2023. Alpaca: A strong, replicable instruction-following model., 7 pages."},{"key":"e_1_3_2_1_72_1","volume-title":"Galactica: A large language model for science. arXiv preprint arXiv:2211.09085","author":"Taylor Ross","year":"2022","unstructured":"Ross Taylor, Marcin Kardas, Guillem Cucurull, Thomas Scialom, Anthony Hartshorn, Elvis Saravia, Andrew Poulton, Viktor Kerkez, and Robert Stojnic. 2022. Galactica: A large language model for science. arXiv preprint arXiv:2211.09085 (2022)."},{"key":"e_1_3_2_1_73_1","volume-title":"Clinical Camel: An Open-Source Expert-Level Medical Language Model with Dialogue-Based Knowledge Encoding. arXiv preprint arXiv:2305.12031","author":"Toma Augustin","year":"2023","unstructured":"Augustin Toma, Patrick R Lawler, Jimmy Ba, Rahul G Krishnan, Barry B Rubin, and Bo Wang. 2023. Clinical Camel: An Open-Source Expert-Level Medical Language Model with Dialogue-Based Knowledge Encoding. arXiv preprint arXiv:2305.12031 (2023)."},{"key":"e_1_3_2_1_74_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_75_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_76_1","unstructured":"A Venigalla J Frankle and M Carbin. 2022. Pubmed gpt: A domain-specific large language model for biomedical text."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"crossref","unstructured":"Devesh Walawalkar Zhiqiang Shen and Marios Savvides. 2020. Online Ensemble Model Compression Using Knowledge Distillation. In ECCV. 18--35.","DOI":"10.1007\/978-3-030-58529-7_2"},{"key":"e_1_3_2_1_78_1","volume-title":"Mulde: Multi-teacher knowledge distillation for low-dimensional knowledge graph embeddings. In WWW. 1716--1726.","author":"Wang Kai","year":"2021","unstructured":"Kai Wang, Yu Liu, Qian Ma, and Quan Z Sheng. 2021. Mulde: Multi-teacher knowledge distillation for low-dimensional knowledge graph embeddings. In WWW. 1716--1726."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"crossref","unstructured":"Zihan Wang Peiyi Wang Lianzhe Huang Xin Sun and Houfeng Wang. 2022. Incorporating Hierarchy into Text Encoder: a Contrastive Learning Approach for Hierarchical Text Classification. In ACL. 7109--7119.","DOI":"10.18653\/v1\/2022.acl-long.491"},{"key":"e_1_3_2_1_80_1","volume-title":"PMC-LLaMA: Towards Building Open-source Language Models for Medicine. arXiv preprint arXiv:2305.10415","author":"Wu Chaoyi","year":"2023","unstructured":"Chaoyi Wu, Weixiong Lin, Xiaoman Zhang, Ya Zhang, Yanfeng Wang, and Weidi Xie. 2023. PMC-LLaMA: Towards Building Open-source Language Models for Medicine. arXiv preprint arXiv:2305.10415, Vol. 
6 (2023)."},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.387"},{"key":"e_1_3_2_1_82_1","volume-title":"Unified and effective ensemble knowledge distillation. arXiv preprint arXiv:2204.00548","author":"Wu Chuhan","year":"2022","unstructured":"Chuhan Wu, Fangzhao Wu, Tao Qi, and Yongfeng Huang. 2022. Unified and effective ensemble knowledge distillation. arXiv preprint arXiv:2204.00548 (2022)."},{"key":"e_1_3_2_1_83_1","volume-title":"Pmc-llama: Further finetuning llama on medical papers. arXiv preprint arXiv:2304.14454","author":"Wu Chaoyi","year":"2023","unstructured":"Chaoyi Wu, Xiaoman Zhang, Ya Zhang, Yanfeng Wang, and Weidi Xie. 2023. Pmc-llama: Further finetuning llama on medical papers. arXiv preprint arXiv:2304.14454 (2023)."},{"key":"e_1_3_2_1_84_1","volume-title":"Federated Selective Aggregation for Knowledge Amalgamation. arXiv preprint arXiv:2207.13309","author":"Xie Donglin","year":"2022","unstructured":"Donglin Xie, Ruonan Yu, Gongfan Fang, Jie Song, Zunlei Feng, Xinchao Wang, Li Sun, and Mingli Song. 2022. Federated Selective Aggregation for Knowledge Amalgamation. arXiv preprint arXiv:2207.13309 (2022)."},{"key":"e_1_3_2_1_85_1","volume-title":"Xlnet: Generalized autoregressive pretraining for language understanding. Advances in neural information processing systems","author":"Yang Zhilin","year":"2019","unstructured":"Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Russ R Salakhutdinov, and Quoc V Le. 2019. Xlnet: Generalized autoregressive pretraining for language understanding. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"crossref","unstructured":"Ze Yang Linjun Shou Ming Gong Wutao Lin and Daxin Jiang. 2020. Model compression with two-stage multi-teacher knowledge distillation for web question answering system. In WSDM. 690--698.","DOI":"10.1145\/3336191.3371792"},{"key":"e_1_3_2_1_87_1","first-page":"37309","article-title":"Deep bidirectional language-knowledge graph pretraining","volume":"35","author":"Yasunaga Michihiro","year":"2022","unstructured":"Michihiro Yasunaga, Antoine Bosselut, Hongyu Ren, Xikun Zhang, Christopher D Manning, Percy S Liang, and Jure Leskovec. 2022. Deep bidirectional language-knowledge graph pretraining. NeurIPS, Vol. 35 (2022), 37309--37323.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"crossref","unstructured":"Michihiro Yasunaga Jure Leskovec and Percy Liang. 2022. LinkBERT: Pretraining Language Models with Document Links. In ACL. 8003--8016.","DOI":"10.18653\/v1\/2022.acl-long.551"},{"key":"e_1_3_2_1_89_1","unstructured":"Jingwen Ye Yixin Ji Xinchao Wang Kairi Ou Dapeng Tao and Mingli Song. 2019. Student Becoming the Master: Knowledge Amalgamation for Joint Scene Parsing Depth Estimation and More. In CVPR. 2829--2838."},{"key":"e_1_3_2_1_90_1","unstructured":"Jingwen Ye Yixin Ji Xinchao Wang Kairi Ou Dapeng Tao and Mingli Song. 2019. Student Becoming the Master: Knowledge Amalgamation for Joint Scene Parsing Depth Estimation and More. In CVPR. 2829--2838."},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"crossref","unstructured":"Shan You Chang Xu Chao Xu and Dacheng Tao. 2017. Learning from Multiple Teacher Networks. In KDD. 1285--1294.","DOI":"10.1145\/3097983.3098135"},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17680"},{"key":"e_1_3_2_1_93_1","volume-title":"Chatdoctor: A medical chat model fine-tuned on llama model using medical domain knowledge. 
arXiv preprint arXiv:2303.14070","author":"Yunxiang Li","year":"2023","unstructured":"Li Yunxiang, Li Zihan, Zhang Kai, Dan Ruilong, and Zhang You. 2023. Chatdoctor: A medical chat model fine-tuned on llama model using medical domain knowledge. arXiv preprint arXiv:2303.14070 (2023)."},{"key":"e_1_3_2_1_94_1","volume-title":"Towards automated deep learning: Efficient joint neural architecture and hyperparameter search. arXiv preprint arXiv:1807.06906","author":"Zela Arber","year":"2018","unstructured":"Arber Zela, Aaron Klein, Stefan Falkner, and Frank Hutter. 2018. Towards automated deep learning: Efficient joint neural architecture and hyperparameter search. arXiv preprint arXiv:1807.06906 (2018)."},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3263105"},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"crossref","unstructured":"Michael Zimmer. 2020. \u201cBut the data is already public\u201d: on the ethics of research in Facebook. In The ethics of information technologies. 229--241.","DOI":"10.4324\/9781003075011-17"}],"event":{"name":"KDD '24: The 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"],"location":"Barcelona Spain","acronym":"KDD '24"},"container-title":["Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671963","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3637528.3671963","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:06:05Z","timestamp":1750291565000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671963"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,24]]},"references-count":96,"alternative-id":["10.1145\/3637528.3671963","10.1145\/3637528"],"URL":"https:\/\/doi.org\/10.1145\/3637528.3671963","relation":{},"subject":[],"published":{"date-parts":[[2024,8,24]]},"assertion":[{"value":"2024-08-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
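
The record above is the envelope returned by the public Crossref REST API (https://api.crossref.org/works/{DOI}). Below is a minimal sketch, using only Python's standard library, of how such a record can be fetched and a few of its fields read; the field names used (title, page, references-count, author) are exactly those present in this record, and network access to api.crossref.org is assumed.

import json
import urllib.request

# DOI taken from the record above.
DOI = "10.1145/3637528.3671963"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

# Crossref wraps the work in an envelope: status / message-type / message.
assert record["status"] == "ok"
work = record["message"]

print(work["title"][0])          # "BoKA: Bayesian Optimization based ..."
print(work["page"])              # "4035-4046"
print(work["references-count"])  # 96
for author in work["author"]:    # Linzhu Yu, Huan Li, Ke Chen, Lidan Shou
    print(author["given"], author["family"])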