{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:16:43Z","timestamp":1773317803287,"version":"3.50.1"},"reference-count":31,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017,6]]},"DOI":"10.1109\/jcdl.2017.7991564","type":"proceedings-article","created":{"date-parts":[[2017,7,27]],"date-time":"2017-07-27T20:40:17Z","timestamp":1501188017000},"page":"1-10","source":"Crossref","is-referenced-by-count":36,"title":["A Benchmark and Evaluation for Text Extraction from PDF"],"prefix":"10.1109","author":[{"given":"Hannah","family":"Bast","sequence":"first","affiliation":[]},{"given":"Claudius","family":"Korzen","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref31","author":"ward","year":"2015","journal-title":"pdf-extract"},{"key":"ref30","author":"national","year":"2017","journal-title":"Institutes of Health's National Library of Medicine PubMed Central"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/2756406.2756946"},{"key":"ref11","author":"dejean","year":"2016","journal-title":"pdftoxml"},{"key":"ref12","year":"2014","journal-title":"FooLabs Xpdf A PDF Viewer for X"},{"key":"ref13","author":"hassan","year":"2013"},{"key":"ref14","year":"2016","journal-title":"Institute of Computer Science and Technology of Peking University Marmot Datasets"},{"key":"ref15","author":"kan","year":"2016","journal-title":"ParsCit"},{"key":"ref16","author":"klampfl","year":"2014","journal-title":"Unsupervised Document Structure Analysis of Digital ScientificArticles JCDL"},{"key":"ref17","author":"korzen","year":"2017","journal-title":"ICECIT"},{"key":"ref18","author":"kruk","year":"2013","journal-title":"pdftohtml"},{"key":"ref19","author":"ley","year":"2009","journal-title":"DBLP Some Lessons Learned"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1045\/november14-tkaczyk"},{"key":"ref4","author":"bird","year":"0","journal-title":"The ACL Anthology Reference Corpus A Reference Dataset for Bibliographic Researchin Computational Linguistics In LREC 2008"},{"key":"ref27","author":"tkaczyk","year":"0","journal-title":"GROTOAP Ground Truth for Open Access Publications In JCDL 2012"},{"key":"ref3","year":"2011","journal-title":"R Berg PDFExtract"},{"key":"ref6","article-title":"CiteSeerX: A Scholarly Big Dataset","author":"caragea","year":"0","journal-title":"ECIR 2014"},{"key":"ref29","article-title":"CERMINE - Automatic Extraction of Metadata and References from ScientificLiterature","author":"tkaczyk","year":"2014","journal-title":"DAS"},{"key":"ref5","author":"burns","year":"2013","journal-title":"LA-Pdf Text"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/2494266.2494271"},{"key":"ref7","author":"constantin","year":"2011","journal-title":"pdfx"},{"key":"ref2","year":"2017","journal-title":"Apache PDFBox"},{"key":"ref9","year":"2017","journal-title":"Cornell University arXiv org e-Print archive"},{"key":"ref1","article-title":"A New Dataset for Fine-Grained Citation Field Extraction","author":"anzaroot","year":"0","journal-title":"ICML Workshop (PEER) 2013"},{"key":"ref20","article-title":"Evaluation of Header Metadata Extraction Approaches and Tools for ScientificPDF Documents","author":"lipinski","year":"0","journal-title":"JCDL 2013"},{"key":"ref22","article-title":"Automating the Construction of Internet Portals with Machine Learning","author":"mccallum","year":"0","journal-title":"Inf Retr 2000"},{"key":"ref21","author":"lopez","year":"2017","journal-title":"Grobid"},{"key":"ref24","author":"shinyama","year":"2016"},{"key":"ref23","article-title":"Layout-Aware Text Extraction from Full-Text PDF of ScientificArticles","author":"ramakrishnan","year":"0","journal-title":"Source Code for Biology and Medicine 2012"},{"key":"ref26","author":"tiedemann","year":"2016","journal-title":"pdf2xml"},{"key":"ref25","year":"2017","journal-title":"Springer Nature BioMed Central"}],"event":{"name":"2017 ACM\/IEEE Joint Conference on Digital Libraries (JCDL)","location":"Toronto, ON, Canada","start":{"date-parts":[[2017,6,19]]},"end":{"date-parts":[[2017,6,23]]}},"container-title":["2017 ACM\/IEEE Joint Conference on Digital Libraries (JCDL)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7988694\/7990621\/07991564.pdf?arnumber=7991564","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2017,8,23]],"date-time":"2017-08-23T23:41:05Z","timestamp":1503531665000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/7991564\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,6]]},"references-count":31,"URL":"https:\/\/doi.org\/10.1109\/jcdl.2017.7991564","relation":{},"subject":[],"published":{"date-parts":[[2017,6]]}}}