{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:32:33Z","timestamp":1763191953978,"version":"3.45.0"},"reference-count":41,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11228563","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["A Low-Noise Web Content Extraction Framework for LLM Data Pipelines: Integrating XGBoost Classification and Reverse Coloring"],"prefix":"10.1109","author":[{"given":"Bin","family":"Wang","sequence":"first","affiliation":[{"name":"Peking University,Guangdong Provincial Key Laboratory of Ultra High Definition Immersive Media Technology, Shenzhen Graduate School"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ziyan","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lexi","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hui","family":"Li","sequence":"additional","affiliation":[{"name":"Peking University,Guangdong Provincial Key Laboratory of Ultra High Definition Immersive Media Technology, Shenzhen Graduate School"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/s10586-018-2366-x"},{"article-title":"Fact or fiction: Content classification for digital libraries","volume-title":"DELOS","author":"Finn","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-short.72"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3696410.3714546"},{"journal-title":"Understanding the effect of noise in llm training data with algorithmic chains of thought","year":"2024","author":"Havrilla","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/2009916.2009952"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/1718487.1718542"},{"journal-title":"Chinesewebtext: Large-scale high-quality chinese web text extracted with effective evaluation model","year":"2023","author":"Chen","key":"ref8"},{"key":"ref9","first-page":"1","article-title":"Content extraction from html documents","volume-title":"1st Int. Workshop on Web Document Analysis (WDA2001)","author":"Rahman"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAE.2010.5451952"},{"key":"ref11","first-page":"155","article-title":"Web page cleaning with conditional random fields","volume":"4","author":"Mared","year":"2007","journal-title":"Calriers du Central"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/2068816.2068846"},{"issue":"2","key":"ref13","first-page":"27","article-title":"Eliminating noisy information in web pages using featured dom tree","volume":"2","author":"Das","year":"2012","journal-title":"International Journal of Applied Information Systems"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/775152.775182"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCICT.2015.7045706"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939785"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3536321"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/956750.956785"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICISA.2013.6579445"},{"key":"ref20","first-page":"109","article-title":"Roadrunner: Towards automatic data extraction from large web sites","volume-title":"VLDB","volume":"1","author":"Crescenzi"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/565117.565137"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1016\/S0004-3702(99)00100-9"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/2487788.2487828"},{"issue":"1","key":"ref24","article-title":"Extraction of data from web pages: A vision based approach","volume":"3","author":"Hiremath","year":"2009","journal-title":"International Journal of Computer and Information Science and Engineering"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2009.109"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-demo.15"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.32614\/cran.package.xgboost"},{"journal-title":"Sweb: A large web dataset for the scandinavian languages","year":"2024","author":"Norlund","key":"ref28"},{"journal-title":"The fineweb datasets: Decanting the web for the finest text data at scale","year":"2024","author":"Penedo","key":"ref29"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.5555\/3294996.3295070"},{"key":"ref31","article-title":"Catboost: unbiased boosting with categorical features","volume":"31","author":"Prokhorenkova","year":"2018","journal-title":"Advances in neural information processing systems"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.29172\/7c2a6982-6d72-4cd8-bba6-2fccb06a7011"},{"issue":"1","key":"ref33","first-page":"189","article-title":"Introduction to artificial neural network","volume":"2","author":"Dongare","year":"2012","journal-title":"International Journal of Engineering and Innovative Technology (IJEIT)"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-9473(01)00065-2"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-41136-6_5"},{"article-title":"readability-lxml","year":"2023","author":"Baburov","key":"ref36"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ITCS.2010.76"},{"article-title":"goose3","year":"2024","author":"Lababidi","key":"ref38"},{"year":"2024","key":"ref39","article-title":"justtext"},{"article-title":"Newspaper3k","year":"2024","author":"Ou-Yang","key":"ref40"},{"year":"2022","key":"ref41","article-title":"Chatgpt api documentation"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11228563.pdf?arnumber=11228563","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:27:37Z","timestamp":1763191657000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11228563\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11228563","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}