{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,4]],"date-time":"2026-02-04T18:13:30Z","timestamp":1770228810223,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","funder":[{"name":"NSFC","award":["U23A20468"],"award-info":[{"award-number":["U23A20468"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3737202","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T21:04:26Z","timestamp":1754255066000},"page":"5106-5116","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["ChatPD: An LLM-driven Paper-Dataset Networking System"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-4766-0797","authenticated-orcid":false,"given":"Anjie","family":"Xu","sequence":"first","affiliation":[{"name":"Key Lab of High Confidence Software Technologies (Peking University), Ministry of Education, Beijing, China and School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2273-8260","authenticated-orcid":false,"given":"Ruiqing","family":"Ding","sequence":"additional","affiliation":[{"name":"Key Laboratory of Process Optimization and Intelligent Decision-making, Ministry of Education, Hefei University of Technology, Hefei, China and School of Management, Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7627-8485","authenticated-orcid":false,"given":"Leye","family":"Wang","sequence":"additional","affiliation":[{"name":"Key Lab of High Confidence Software Technologies (Peking University), Ministry of Education, Beijing, China and School of Computer Science, Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"2016. Diginetica dataset for CIKM Cup 2016 challenge. https:\/\/competitions.codalab.org\/competitions\/11161."},{"key":"e_1_3_2_2_2_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","unstructured":"arXiv.org submitters. 2024. arXiv Dataset. doi:10.34740\/KAGGLE\/DSV\/7548853","DOI":"10.34740\/KAGGLE\/DSV\/7548853"},{"key":"e_1_3_2_2_4_1","volume-title":"Maschenka Balkenhol, Meyke Hermsen, Babak Ehteshami Bejnordi, Byungjae Lee, Kyunghyun Paeng, Aoxiao Zhong, et al.","author":"Bandi Peter","year":"2018","unstructured":"Peter Bandi, Oscar Geessink, Quirine Manson, Marcory Van Dijk, Maschenka Balkenhol, Meyke Hermsen, Babak Ehteshami Bejnordi, Byungjae Lee, Kyunghyun Paeng, Aoxiao Zhong, et al. 2018. From detection of individual metastases to classification of lymph node status at the patient level: the CAMELYON17 challenge. IEEE Transactions on Medical Imaging (2018)."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/icwsm.v3i1.13937"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2792838.2798723"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308558.3313685"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00778-019-00564-x"},{"key":"e_1_3_2_2_9_1","volume-title":"Thien Huu Nguyen, and Yoshua Bengio","author":"Chevalier-Boisvert Maxime","year":"2018","unstructured":"Maxime Chevalier-Boisvert, Dzmitry Bahdanau, Salem Lahlou, Lucas Willems, Chitwan Saharia, Thien Huu Nguyen, and Yoshua Bengio. 2018. BabyAI: A platform to study the sample efficiency of grounded language learning. Preprint arXiv:1810.08272 (2018)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/352595.352598"},{"key":"e_1_3_2_2_11_1","unstructured":"Together Computer. 2023. RedPajama: An Open Source Recipe to Reproduce LLaMA training dataset. https:\/\/github.com\/togethercomputer\/RedPajama-Data"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.350"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/234173.234209"},{"key":"e_1_3_2_2_14_1","unstructured":"Ganqu Cui Lifan Yuan Ning Ding Guanming Yao Wei Zhu Yuan Ni Guotong Xie Zhiyuan Liu and Maosong Sun. 2023. UltraFeedback: Boosting Language Models with High-quality Feedback. arXiv:2310.01377 [cs.CL]"},{"key":"e_1_3_2_2_15_1","unstructured":"Will Cukierski. 2012. Titanic - Machine Learning from Disaster. https:\/\/kaggle.com\/competitions\/titanic"},{"key":"e_1_3_2_2_16_1","unstructured":"FICO. 2018. Fico xml challenge. https:\/\/community.fico.com\/s\/explainablemachine-learning-challenge"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/276675.276685"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1093\/jamia\/ocw042"},{"key":"e_1_3_2_2_19_1","first-page":"2020","article-title":"Lost or Found? Discovering Data Needed for Research","volume":"4","author":"Gregory Kathleen","year":"2020","unstructured":"Kathleen Gregory, Paul Groth, Andrea Scharnhorst, and Sally Wyatt. 2020. Lost or Found? Discovering Data Needed for Research. Harvard Data Science Review, 4 2020.","journal-title":"Harvard Data Science Review"},{"key":"e_1_3_2_2_20_1","volume-title":"Newsroom: A dataset of 1.3 million summaries with diverse extractive strategies. arXiv preprint arXiv:1804.11283","author":"Grusky Max","year":"2018","unstructured":"Max Grusky, Mor Naaman, and Yoav Artzi. 2018. Newsroom: A dataset of 1.3 million summaries with diverse extractive strategies. arXiv preprint arXiv:1804.11283 (2018)."},{"key":"e_1_3_2_2_21_1","volume-title":"Advances in Neural Information Processing Systems","author":"Hao Qianyue","year":"2024","unstructured":"Qianyue Hao, Jingyang Fan, Fengli Xu, Jian Yuan, and Yong Li. 2024. HLM-Cite: Hybrid Language Model Workflow for Text-based Scientific Citation Prediction. In Advances in Neural Information Processing Systems, A. Globerson, L. Mackey, D. Belgrave, A. Fan, U. Paquet, J. Tomczak, and C. Zhang (Eds.), Vol. 37. Curran Associates, Inc., 48189-48223. https:\/\/proceedings.neurips.cc\/paper_files\/paper \/2024\/file\/5635925cf9d2274f338eb0dd5971e845-Paper-Conference.pdf"},{"key":"e_1_3_2_2_22_1","volume-title":"Triviaqa: A large scale distantly supervised challenge dataset for reading comprehension. arXiv preprint arXiv:1705.03551","author":"Joshi Mandar","year":"2017","unstructured":"Mandar Joshi, Eunsol Choi, Daniel S Weld, and Luke Zettlemoyer. 2017. Triviaqa: A large scale distantly supervised challenge dataset for reading comprehension. arXiv preprint arXiv:1705.03551 (2017)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24592-8_15"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ijhcs.2019.10.004"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3025453.3025838"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00276"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2487575.2487592"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.474"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539247"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.598"},{"key":"e_1_3_2_2_32_1","first-page":"740","volume-title":"Zurich","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision-ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13. Springer, 740-755."},{"key":"e_1_3_2_2_33_1","volume-title":"Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio.","author":"Lin Zhouhan","year":"2017","unstructured":"Zhouhan Lin, Minwei Feng, Cicero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. 2017. A structured self-attentive sentence embedding. arXiv preprint arXiv:1703.03130 (2017)."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.139"},{"key":"e_1_3_2_2_35_1","unstructured":"Aixin Liu Bei Feng Bing Xue BingxuanWang BochaoWu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_2_36_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 364","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 364 (2019)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.425"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.124"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-021-00339-6"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605943"},{"key":"e_1_3_2_2_41_1","first-page":"267","article-title":"The field matching problem: algorithms and applications","volume":"2","author":"Monge Alvaro E","year":"1996","unstructured":"Alvaro E Monge, Charles Elkan, et al. 1996. The field matching problem: algorithms and applications. In Kdd, Vol. 2. 267-270.","journal-title":"Kdd"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10822-014-9776-5"},{"key":"e_1_3_2_2_43_1","unstructured":"Tri Nguyen Mir Rosenberg Xia Song Jianfeng Gao Saurabh Tiwary Rangan Majumder and Li Deng. 2016. Ms marco: A human-generated machine reading comprehension dataset. (2016)."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.5815\/ijieeb.2012.01.07"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-7152(96)00140-X"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00592"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-2604"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00886"},{"key":"e_1_3_2_2_49_1","volume-title":"100,000 questions for machine comprehension of text. arXiv preprint arXiv:1606.05250","author":"Rajpurkar Pranav","year":"2016","unstructured":"Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016. Squad: 100,000 questions for machine comprehension of text. arXiv preprint arXiv:1606.05250 (2016)."},{"key":"e_1_3_2_2_50_1","volume-title":"A hierarchical graphical model for record linkage. arXiv preprint arXiv:1207.4180","author":"Ravikumar Pradeep","year":"2012","unstructured":"Pradeep Ravikumar and William Cohen. 2012. A hierarchical graphical model for record linkage. arXiv preprint arXiv:1207.4180 (2012)."},{"key":"e_1_3_2_2_51_1","volume-title":"Collective classification in network data. AI magazine 29, 3","author":"Sen Prithviraj","year":"2008","unstructured":"Prithviraj Sen, Galileo Namata, Mustafa Bilgic, Lise Getoor, Brian Galligher, and Tina Eliassi-Rad. 2008. Collective classification in network data. AI magazine 29, 3 (2008), 93-93."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11786-010-0024-7"},{"key":"e_1_3_2_2_53_1","unstructured":"Qwen Team. 2024. Qwen2.5: A Party of Foundation Models. https:\/\/qwenlm.github.io\/blog\/qwen2.5\/"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10115-007-0094-2"},{"key":"e_1_3_2_2_55_1","volume-title":"Newsqa: A machine comprehension dataset. arXiv preprint arXiv:1611.09830","author":"Trischler Adam","year":"2016","unstructured":"Adam Trischler, Tong Wang, Xingdi Yuan, Justin Harris, Alessandro Sordoni, Philip Bachman, and Kaheer Suleman. 2016. Newsqa: A machine comprehension dataset. arXiv preprint arXiv:1611.09830 (2016)."},{"key":"e_1_3_2_2_56_1","volume-title":"GLUE: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461","author":"Singh Amanpreet","year":"2018","unstructured":"AlexWang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R Bowman. 2018. GLUE: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461 (2018)."},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.335"},{"key":"e_1_3_2_2_58_1","volume-title":"Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le.","author":"Wei Jason","year":"2021","unstructured":"Jason Wei, Maarten Bosma, Vincent Y Zhao, Kelvin Guu, Adams Wei Yu, Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le. 2021. Finetuned language models are zero-shot learners. arXiv preprint arXiv:2109.01652 (2021)."},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3389743"},{"key":"e_1_3_2_2_60_1","volume-title":"Fashion-mnist: a novel image dataset for benchmarking machine learning algorithms. arXiv preprint arXiv:1708.07747","author":"Xiao Han","year":"2017","unstructured":"Han Xiao, Kashif Rasul, and Roland Vollgraf. 2017. Fashion-mnist: a novel image dataset for benchmarking machine learning algorithms. arXiv preprint arXiv:1708.07747 (2017)."},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606800"}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3737202","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,16]],"date-time":"2025-08-16T14:43:55Z","timestamp":1755355435000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3737202"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":61,"alternative-id":["10.1145\/3711896.3737202","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3737202","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}