{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T14:39:34Z","timestamp":1742913574399,"version":"3.40.3"},"publisher-location":"London","reference-count":48,"publisher":"Springer London","isbn-type":[{"type":"print","value":"9780857298584"},{"type":"electronic","value":"9780857298591"}],"license":[{"start":{"date-parts":[[2014,1,1]],"date-time":"2014-01-01T00:00:00Z","timestamp":1388534400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2014]]},"DOI":"10.1007\/978-0-85729-859-1_26","type":"book-chapter","created":{"date-parts":[[2014,5,12]],"date-time":"2014-05-12T04:40:25Z","timestamp":1399869625000},"page":"775-804","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Analysis of Documents Born Digital"],"prefix":"10.1007","author":[{"given":"Jianying","family":"Hu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ying","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,7,24]]},"reference":[{"key":"26_CR1","doi-asserted-by":"crossref","unstructured":"Adelberg B (1998) NoDoSE \u2013 a tool for semi-automatically extracting structured and Semi-structured data from text documents. In: ACM SIGMOD international conference on management of data (SIGMOD\u201998), Seattle, pp\u00a0283\u2013294","DOI":"10.1145\/276305.276330"},{"key":"26_CR2","doi-asserted-by":"crossref","unstructured":"Ailon N, Charikar M, Newman A (2005) Aggregating inconsistent information: ranking and clustering. In: 37th STOC, Baltimore, pp 684\u2013693","DOI":"10.1145\/1060590.1060692"},{"key":"26_CR3","doi-asserted-by":"crossref","unstructured":"Anjewierden A (2001) AIDAS: incremental logical structure discovery in PDF documents. In: 6th international conference on document analysis and recognition (ICDAR), Seattle, Sept 2001, pp\u00a0374\u2013378","DOI":"10.1109\/ICDAR.2001.953816"},{"volume-title":"Web document analysis: challenges and opportunities","year":"2004","key":"26_CR4","unstructured":"Antonacopoulos A, Hu J (ed) (2004) Web document analysis: challenges and opportunities. World Scientific, Singapore"},{"key":"26_CR5","doi-asserted-by":"publisher","first-page":"406","DOI":"10.1007\/3-540-36901-5_42","volume-title":"Web Technologies and Applications","author":"Deng Cai","year":"2003","unstructured":"Cai D, Yu S, Wen J-R, Ma W-Y (2003) Extracting content structure for web pages based on visual representation. In 5th Asia Pacific Web Conference, pp\u00a0406\u2013415"},{"key":"26_CR6","unstructured":"Califf ME, Mooney RJ (1999) Relational learning of pattern-match rules for information extraction. In: Proceedings of the sixteenth national conference on artificial intelligence and the eleventh innovative applications of artificial intelligence conference, AAAI\u201999\/IAAI\u201999, Orlando. Menlo Park, pp\u00a06\u201311"},{"key":"26_CR7","doi-asserted-by":"crossref","unstructured":"Chakrabarti D, Kumar R, Punera K (2008) A graph-theoretic approach to webpage segmentation. In: WWW 2008, Beijing, pp\u00a0377\u2013386","DOI":"10.1145\/1367497.1367549"},{"key":"26_CR8","series-title":"Lecture notes in computer science","doi-asserted-by":"publisher","first-page":"13","DOI":"10.1007\/978-3-540-28640-0_20","volume-title":"Document analysis systems VI","author":"H Chao","year":"2004","unstructured":"Chao H, Fan J (2004) Layout and content extraction for pdf documents. In: Marinai S, Dengel A (eds) Document analysis systems VI. Lecture notes in computer science, vol 3163. Springer, New York\/Berlin, pp\u00a013\u2013224"},{"key":"26_CR9","unstructured":"Chen JS, Tseng DC (1996) Overlapped-character separation and construction for table-form documents. In: IEEE international conference on image processing (ICIP), Lausanne, pp\u00a0233\u2013236"},{"issue":"1","key":"26_CR10","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/MIC.2005.5","volume":"9","author":"Y Chen","year":"2005","unstructured":"Chen Y, Xie X, Ma W-Y, Zhang H-J (2005) Adapting web pages for small-screen devices. Internet Computing, 9(1):50\u201356","journal-title":"Internet Computing"},{"key":"26_CR11","first-page":"1","volume-title":"Qualitative spatial representation and reasoning techniques, vol 1303","author":"AG Cohn","year":"1997","unstructured":"Cohn AG (1997) Qualitative spatial representation and reasoning techniques, vol 1303. Springer, Berlin, pp\u00a01\u201330"},{"issue":"1","key":"26_CR12","doi-asserted-by":"publisher","first-page":"17","DOI":"10.1007\/s100320100077","volume":"5","author":"P Duygulu","year":"2002","unstructured":"Duygulu P, Atalay V (2002) A hierarchical representation of form documents for identification and retrieval. IJDAR 5(1):17\u201327","journal-title":"IJDAR"},{"issue":"3","key":"26_CR13","doi-asserted-by":"publisher","first-page":"227","DOI":"10.1016\/S0169-023X(99)00027-0","volume":"31","author":"DW Embley","year":"1999","unstructured":"Embley DW, Campbell DM, Jiang YS, Liddle SW, Lonsdale DW, Ng Y-K, Smith RD (1999) Conceptual-model-based data extraction from multiple-record web pages. Data Knowl Eng 31(3):227\u2013251","journal-title":"Data Knowl Eng"},{"key":"26_CR14","unstructured":"Finn A, Kushmerick N, Smyth B (2001) Fact or fiction: content classification for digital libraries. In: Joint DELOS-NSF workshop on personalisation and recommender systems in digital libraries, Dublin, p\u00a01"},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"Futrelle RP, Shao M, Cieslik C, Grimes AE (2003) Extraction, layout analysis and classification of diagrams in PDF documents. In: International conference on document analysis and recognition (ICDAR) 2003, proceedings, Edinburgh, vol 2, p 1007","DOI":"10.1109\/ICDAR.2003.1227811"},{"key":"26_CR16","unstructured":"Gatterbauer W, Bohunsky P (2006) Table extraction using spatial reasoning on the CSS2 visual box model. In: Proceedings of the 21st national conference on artificial intelligence (AAAI), Boston, vol 2, pp\u00a01313\u20131318"},{"issue":"5","key":"26_CR17","doi-asserted-by":"publisher","first-page":"20","DOI":"10.5120\/1945-2601","volume":"15","author":"Neha Gupta","year":"2011","unstructured":"Gupta N, Hilal S Dr (2011) A heuristic approach for web content extraction. Int J Comput Appl 15(5):20\u201324","journal-title":"International Journal of Computer Applications"},{"key":"26_CR18","doi-asserted-by":"crossref","unstructured":"Hadjar K, Rigamonti M, Lalanne D, Ingold R (2004) Xed: a new tool for extracting hidden structures from electronic documents. In: Document image analysis for libraries, Palo Alto, pp\u00a0212\u2013224","DOI":"10.1109\/DIAL.2004.1263250"},{"key":"26_CR19","doi-asserted-by":"crossref","unstructured":"Hassan T (2009) Object-level document analysis of PDF files. In: Proceedings of the 9th ACM symposium on document engineering (DocEng\u201909), Munich. ACM, New York, pp\u00a047\u201355","DOI":"10.1145\/1600193.1600206"},{"key":"26_CR20","doi-asserted-by":"crossref","unstructured":"Hassan T (2009) User-guided wrapping of PDF documents using graph matching techniques. In: International conference on document analysis and recognition \u2013 ICDAR, Barcelona, pp\u00a0631\u2013635","DOI":"10.1109\/ICDAR.2009.238"},{"key":"26_CR21","unstructured":"Hurst M (2001) Layout and language: challenges for table understanding on the web. In: Proceedings of the 1st international workshop on web document analysis, Seattle"},{"issue":"12","key":"26_CR22","doi-asserted-by":"publisher","first-page":"2055","DOI":"10.1016\/S0031-3203(98)00067-3","volume":"31","author":"AK Jain","year":"1998","unstructured":"Jain AK, Yu B (1998) Automatic text location in images and video frames. Pattern Recognit 31(12):2055\u20132076","journal-title":"Pattern Recognit"},{"key":"26_CR23","unstructured":"Karatzas D (2002) Text segmentation in web images using colour perception and topological features. PhD Thesis, University of Liverpool"},{"issue":"5","key":"26_CR24","doi-asserted-by":"publisher","first-page":"564","DOI":"10.1016\/j.imavis.2006.05.003","volume":"25","author":"D Karatzas","year":"2007","unstructured":"Karatzas D, Anotnacopoulos A (2007) Colour text segmentation in web images based on human perception. Image Vis Comput 25(5):564\u2013577","journal-title":"Image Vis Comput"},{"key":"26_CR25","first-page":"268","volume":"13","author":"J Kong","year":"2006","unstructured":"Kong J, Zhang K, Zeng X (2006) Spatial graph grammars for graphical user interfaces. CHI 13:268\u2013307","journal-title":"CHI"},{"key":"26_CR26","doi-asserted-by":"crossref","unstructured":"Krupl B, Herzog M, Gatterbauer W (2005) Using visual cues for extraction of tabular data from arbitrary HTML documents. In: Proceedings of the 14th international conference on world wide web (WWW), Chiba","DOI":"10.1145\/1062745.1062838"},{"issue":"1\u20132","key":"26_CR27","first-page":"15","volume":"118","author":"N Kushmerick","year":"2000","unstructured":"Kushmerick N (2000) Wrapper induction: efficiency and expressiveness. Artif Intell Spec Issue Intell Internet Syst 118(1\u20132):15\u201368","journal-title":"Artif Intell Spec Issue Intell Internet Syst"},{"issue":"2","key":"26_CR28","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/565117.565137","volume":"31","author":"AHF Laender","year":"2002","unstructured":"Laender AHF, Ribeiro-Neto BA, da Silva AS, Teixeira JS (2002) A brief survey of web data extraction tools. ACM SIGMOD Rec Homepage Arch 31(2):84\u201393","journal-title":"ACM SIGMOD Rec Homepage Arch"},{"issue":"2","key":"26_CR29","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1016\/S0169-023X(01)00047-7","volume":"40","author":"AHF Laender","year":"2002","unstructured":"Laender AHF, Ribeiro-Neto B, da Silva AS (2002) DEByE \u2013 date extraction by example. Data Knowl Eng 40(2):121\u2013154","journal-title":"Data Knowl Eng"},{"key":"26_CR30","unstructured":"Lien Y-LL (1989) Apparatus and method for vectorization of incoming scanned image data. United States Patent US4,817,187, assigned to GTX Corporation, Phoenix, Arizona, 28 Mar 1989"},{"key":"26_CR31","doi-asserted-by":"crossref","unstructured":"Liu Y, Bai K, Mitra P, Lee Giles C (2007) TableSeer: automatic table metadata extraction and searching in digital libraries. In: ACM\/IEEE joint conference on digital libraries, Vancouver, pp\u00a091\u2013100","DOI":"10.1145\/1255175.1255193"},{"issue":"2\/3","key":"26_CR32","doi-asserted-by":"publisher","first-page":"177","DOI":"10.1023\/A:1009954710479","volume":"2","author":"D Lopresti","year":"2000","unstructured":"Lopresti D, Zhou J (2000) Locating and recognizing text in WWW images. Inf Retr 2(2\/3):177\u2013206","journal-title":"Inf Retr"},{"issue":"3","key":"26_CR33","first-page":"207","volume":"8","author":"W Lovegrove","year":"1995","unstructured":"Lovegrove W, Brailsford D (1995) Document analysis of PDF files: methods, results and implications. Electron Publ Orig Dissem Des 8(3):207\u2013220","journal-title":"Electron Publ Orig Dissem Des"},{"key":"26_CR34","doi-asserted-by":"crossref","unstructured":"Luo P, Fan J, Liu S, Lin F, Xiong Y, Liu J (2009) Web article extraction for web printing: a DOM+visual based approach. In: Proceedings of the DocEng, Munich. ACM, pp\u00a066\u201369","DOI":"10.1145\/1600193.1600208"},{"key":"26_CR35","doi-asserted-by":"crossref","unstructured":"Marinai S (2009) Metadata extraction from PDF papers for digital library ingest. In: Proceedings of the 10th international conference on document analysis and recognition (ICDAR), Barcelona, pp\u00a0251\u2013255","DOI":"10.1109\/ICDAR.2009.232"},{"key":"26_CR36","unstructured":"McKeown KR, Barzilay R, Evans D, Hatzivassiloglou V, Kan MY, Schiffman B, Teufel S (2001) Columbia multi-document summarization: approach and evaluation. In: Document understanding conference, New Orleans"},{"key":"26_CR37","unstructured":"Okun O, Doermann D, Pietikainen M (1999) Page segmentation and zone classification: the state of the art. Technical report: LAMP-TR-036\/CAR-TR-927\/CS-TR-4079, University of Maryland, College Park, Nov 1999"},{"key":"26_CR38","doi-asserted-by":"crossref","unstructured":"Oro E, Ruffolo M (2009) PDF-TREX: an approach for recognizing and extracting tables from PDF documents. In: ICDAR\u201909 proceedings of the 2009 10th international conference on document analysis and recognition, Barcelona, pp\u00a0906\u2013910","DOI":"10.1109\/ICDAR.2009.12"},{"key":"26_CR39","unstructured":"Petrie H, Harrison C, Dev S (2005) Describing images on the web: a survey of current practice and prospects for the future. In: Proceedings of human computer interaction international (HCII), Las Vegas, July 2005"},{"issue":"2\u20133","key":"26_CR40","first-page":"153","volume":"8","author":"PN Smith","year":"1995","unstructured":"Smith PN, Brailsford DF (1995) Towards structured, block-based PDF. Electron Publ Orig Dissem Des 8(2\u20133):153\u2013165","journal-title":"Electron Publ Orig Dissem Des"},{"issue":"1\u20133","key":"26_CR41","first-page":"233","volume":"34","author":"S Soderland","year":"1999","unstructured":"Soderland S, Cardie C, Mooney R (1999) Learning information extraction rules for semi-structured and free text. Mach Learn Spec Issue Nat Lang Learn 34(1\u20133):233\u2013272","journal-title":"Mach Learn Spec Issue Nat Lang Learn"},{"key":"26_CR42","first-page":"249","volume-title":"Lecture Notes in Computer Science","author":"Yalin Wang","year":"2002","unstructured":"Wang Y, Hu J (2002) Detecting tables in HTML documents. In: Fifth IAPR international workshop on document analysis systems, Princeton, Aug 2002. Lecture notes in computer science, vol 2423, pp\u00a0249\u2013260"},{"key":"26_CR43","unstructured":"Wang Y, Phillips IT, Haralick RM (2000) Statistical-based approach to word segmentation, In: 15th international conference on pattern recognition, ICPR2000, vol 4. Barcelona, Spain, pp\u00a0555\u2013558"},{"key":"26_CR44","series-title":"Lecture notes in computer science","first-page":"29","volume-title":"Document analysis systems","author":"HC Wasserman","year":"2002","unstructured":"Wasserman HC, Yukawa K, Sy BK, Kwok K-L, Phillips IT (2002) A theoretical foundation and a method for document table structure extraction and decomposition. In: Lopresti DP, Hu J, Kashi R (eds) Document analysis systems. Lecture notes in computer science, vol 2423. Springer, Berlin\/New York, pp\u00a029\u2013294"},{"key":"26_CR45","volume-title":"Color science: concepts and methods, quantitative data and formulae","author":"G Wyszecki","year":"1982","unstructured":"Wyszecki G, Stiles W (1982) Color science: concepts and methods, quantitative data and formulae, 2nd edn. Wiley, New York","edition":"2"},{"key":"26_CR46","unstructured":"Yildiz B, Kaiser K, Miksch S (2005) pdf2table: a method to extract table information from PDF files. In: Proceedings of the 2nd Indian international conference on artificial intelligence (IICAI05), Pune, pp\u00a01773\u20131785"},{"issue":"1","key":"26_CR47","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10032-004-0120-9","volume":"7","author":"R Zanibbi","year":"2004","unstructured":"Zanibbi R, Blostein D, Cordy JR (2004) A survey of table recognition: models, observations, transformations, and inferences. Int J Doc Anal Recognit 7(1):1\u201316","journal-title":"Int J Doc Anal Recognit"},{"key":"26_CR48","doi-asserted-by":"crossref","unstructured":"Zhu J, Nie Z, Wen J-R, Zhang B, Ma W-Y (2005) 2D conditional random fields for web information extraction. In: Proceedings of the ICML\u201905, Bonn. ACM, pp\u00a01044\u20131051","DOI":"10.1145\/1102351.1102483"}],"container-title":["Handbook of Document Image Processing and Recognition"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-0-85729-859-1_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,7,23]],"date-time":"2019-07-23T09:06:55Z","timestamp":1563872815000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-0-85729-859-1_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014]]},"ISBN":["9780857298584","9780857298591"],"references-count":48,"URL":"https:\/\/doi.org\/10.1007\/978-0-85729-859-1_26","relation":{},"subject":[],"published":{"date-parts":[[2014]]},"assertion":[{"value":"24 July 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}}