{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:06Z","timestamp":1755870006684,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":11,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"Bpifrance","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3730234","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T01:41:01Z","timestamp":1752457261000},"page":"2853-2857","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Multilingual Evaluation of Main Content Extractors for Web Pages"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1351-5837","authenticated-orcid":false,"given":"Aur\u00e9lien","family":"Bournonville","sequence":"first","affiliation":[{"name":"University of Caen Normandy, Caen, France and Babbar.tech, Petit-Quevilly, France"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5840-1603","authenticated-orcid":false,"given":"Ga\u00ebl","family":"Dias","sequence":"additional","affiliation":[{"name":"University of Caen Normandy, Caen, France"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8554-4754","authenticated-orcid":false,"given":"Thomas","family":"Largillier","sequence":"additional","affiliation":[{"name":"Babbar.tech, Petit-Quevilly, France"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6984-0291","authenticated-orcid":false,"given":"Emmanuel","family":"Marchand","sequence":"additional","affiliation":[{"name":"Babbar.tech, Petit-Quevilly, France"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8644-2461","authenticated-orcid":false,"given":"Fabrice","family":"Maurel","sequence":"additional","affiliation":[{"name":"University of Caen Normandy, Caen, France"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6172-4342","authenticated-orcid":false,"given":"Guillaume","family":"Pitel","sequence":"additional","affiliation":[{"name":"Babbar.tech, Petit-Quevilly, France"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8162-0997","authenticated-orcid":false,"given":"Fran\u00e7ois","family":"Rioult","sequence":"additional","affiliation":[{"name":"University of Caen Normandy, Caen, France"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-demo.15"},{"key":"e_1_3_2_1_2_1","first-page":"5","volume-title":"Language Resources and Evaluation Conference (LREC","author":"Barbaresi Adrien","year":"2020","unstructured":"Adrien Barbaresi and Ga\u00ebl Lejeune. 2020. Out-of-the-box and into the ditch? Multilingual evaluation of generic text extraction tools. In Language Resources and Evaluation Conference (LREC 2020). 5-13."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591920"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/1718487.1718542"},{"key":"e_1_3_2_1_5_1","volume-title":"Soviet Physics-Doklady","volume":"10","author":"Lcvenshtcin V. I.","year":"1966","unstructured":"V. I. Lcvenshtcin. 1966. Binary codes capable of correcting deletions, insertions, and reversals. In Soviet Physics-Doklady, Vol. 10. Issue 8."},{"volume-title":"Advances in Natural Language Processing.","author":"Lejeune Ga\u00ebl","key":"e_1_3_2_1_6_1","unstructured":"Ga\u00ebl Lejeune, Romain Brixtel, Antoine Doucet, and Nadine Lucas. 2012. DAnIEL: Language Independent Character-Based News Surveillance. In Advances in Natural Language Processing. Vol. 7614. Springer Berlin Heidelberg, 64-75. Lecture Notes in Computer Science."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Jurek Leonhardt Avishek Anand and Megha Khosla. 2020. Boilerplate Removal using a Neural Sequence Labeling Model. https:\/\/arxiv.org\/pdf\/2004.14294","DOI":"10.1145\/3366424.3383547"},{"key":"e_1_3_2_1_8_1","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text Summarization Branches Out. 74-81.","journal-title":"Text Summarization Branches Out."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1098\/rspl.1895.0041"},{"key":"e_1_3_2_1_10_1","volume-title":"Masarykova univerzita, Fakulta informatiky","author":"Pomik\u00e1lek Jan","year":"2011","unstructured":"Jan Pomik\u00e1lek. 2011. Removing boilerplate and duplicate content from web corpora. Disertacn? pr\u00e1ce, Masarykova univerzita, Fakulta informatiky (2011)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1093\/ije\/dyq191"}],"event":{"name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Padua Italy","acronym":"SIGIR '25"},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3730234","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:21:08Z","timestamp":1755868868000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3726302.3730234"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":11,"alternative-id":["10.1145\/3726302.3730234","10.1145\/3726302"],"URL":"https:\/\/doi.org\/10.1145\/3726302.3730234","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}