{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,29]],"date-time":"2025-09-29T08:11:33Z","timestamp":1759133493404,"version":"3.37.3"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2018,6,9]],"date-time":"2018-06-09T00:00:00Z","timestamp":1528502400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["IJDAR"],"published-print":{"date-parts":[[2018,9]]},"DOI":"10.1007\/s10032-018-0305-2","type":"journal-article","created":{"date-parts":[[2018,6,9]],"date-time":"2018-06-09T11:00:46Z","timestamp":1528542046000},"page":"161-175","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Learning to detect, localize and recognize many text objects in document images from few examples"],"prefix":"10.1007","volume":"21","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7218-4158","authenticated-orcid":false,"given":"Bastien","family":"Moysset","sequence":"first","affiliation":[]},{"given":"Christopher","family":"Kermorvant","sequence":"additional","affiliation":[]},{"given":"Christian","family":"Wolf","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,6,9]]},"reference":[{"issue":"2","key":"305_CR1","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1007\/s00521-004-0444-x","volume":"14","author":"S Behnke","year":"2005","unstructured":"Behnke, S.: Face localization and tracking in the neural abstraction pyramid. Neural Comput. Appl. 14(2), 97\u2013103 (2005)","journal-title":"Neural Comput. Appl."},{"key":"305_CR2","doi-asserted-by":"crossref","unstructured":"Bell, S., Zitnick, L., Bala, K., Girshick, R.: Inside-outside net: Detecting objects in context with skip pooling and recurrent neural networks. In: IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas (2016)","DOI":"10.1109\/CVPR.2016.314"},{"key":"305_CR3","unstructured":"Bluche, T.: Joint line segmentation and transcription for end-to-end handwritten paragraph recognition. In: Advances in Neural Information Processing System, Barcelona (2016)"},{"key":"305_CR4","doi-asserted-by":"crossref","unstructured":"Bluche, T., Moysset, B., Kermorvant, C.: Automatic line segmentation and ground-truth alignment of handwritten documents. In: International Conference on Frontiers in Handwriting Recognition, Crete (2014)","DOI":"10.1109\/ICFHR.2014.117"},{"key":"305_CR5","doi-asserted-by":"crossref","unstructured":"Brunessaux, S., Giroux, P., Grilheres, B., Manta, M., Bodin, M., Choukri, K., Galibert, O., Kahn, J.: The maurdor project\u2014improving automatic processing of digital documents. In: Document Analysis Systems, Tours (2014)","DOI":"10.1109\/DAS.2014.58"},{"key":"305_CR6","unstructured":"Chen, K., Seuret, M., Hennebert, J., Ingold, R.: Convolutional neural networks for page segmentation of historical document images. In: 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), 2017, Kyoto, vol.\u00a01, pp. 965\u2013970. IEEE (2017)"},{"key":"305_CR7","unstructured":"Dai, J., Li, Y., He, K., Sun, J.: R-fcn: object detection via region-based fully convolutional networks. In: Advances in neural information processing systems, Barcelona, pp. 379\u2013387 (2016)"},{"key":"305_CR8","unstructured":"Delakis, M., Garcia, C.: Text detection with convolutional neural networks. In: International Conference on Computer Vision Theory and Applications, Madeira, pp. 290\u2013294 (2008)"},{"key":"305_CR9","unstructured":"Doetsch, P., Zeyer, A., Voigtlaender, P., Kulikov, I., Schl\u00fcter, R., Ney, H.: Returnn: The rwth extensible training framework for universal recurrent neural networks. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2017, New Orleans, pp. 5345\u20135349. IEEE (2017)"},{"key":"305_CR10","doi-asserted-by":"crossref","unstructured":"Erhan, D., Szegedy, C., Toshev, A., Anguelov, D.: Scalable object detection using deep neural networks. In: IEEE Conference on Computer Vision and Pattern Recognition, Colombus (2014)","DOI":"10.1109\/CVPR.2014.276"},{"key":"305_CR11","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.patcog.2016.10.023","volume":"64","author":"S Eskenazi","year":"2017","unstructured":"Eskenazi, S., Gomez-Kr\u00e4mer, P., Ogier, J.M.: A comprehensive survey of mostly textual document segmentation algorithms since 2008. Pattern Recognit. 64, 1\u201314 (2017)","journal-title":"Pattern Recognit."},{"issue":"2","key":"305_CR12","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C.K., Winn, J., Zisserman, A.: The pascal visual object classes (voc) challenge. Int. J. Comput. Vis. 88(2), 303\u2013338 (2010)","journal-title":"Int. J. Comput. Vis."},{"issue":"9","key":"305_CR13","doi-asserted-by":"publisher","first-page":"1627","DOI":"10.1109\/TPAMI.2009.167","volume":"32","author":"PF Felzenszwalb","year":"2010","unstructured":"Felzenszwalb, P.F., Girshick, R.B., McAllester, D., Ramanan, D.: Object detection with discriminatively trained part-based models. IEEE Trans. Pattern Anal. Mach. Intell. 32(9), 1627\u20131645 (2010)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"305_CR14","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast R-CNN. In: International Conference on Computer Vision, Santiago (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"305_CR15","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: IEEE Conference on Computer Vision and Pattern Recognition, Colombus (2014)","DOI":"10.1109\/CVPR.2014.81"},{"key":"305_CR16","doi-asserted-by":"crossref","unstructured":"Graves, A., Fernandez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: International Conference on Machine Learning, Pittsburgh (2006)","DOI":"10.1145\/1143844.1143891"},{"key":"305_CR17","unstructured":"Graves, A., Schmidhuber, J.: Offline handwriting recognition with multidimensional recurrent neural networks. In: Advances in Neural Information Processing System, Vancouver (2008)"},{"key":"305_CR18","doi-asserted-by":"crossref","unstructured":"Gupta, A., Vedaldi, A., Zisserman, A.: Synthetic data for text localisation in natural images. In: IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas (2016)","DOI":"10.1109\/CVPR.2016.254"},{"key":"305_CR19","unstructured":"Iandola, F., Hand, S., Moskewicz, M., Ashraf, K.: Squeezenet: Alexnet-level accuracy with $$50 \\times $$ 50 \u00d7 fewer parameters and $$< 0.5\\text{MB}$$ < 0.5 MB model size. In: Openreview submission to ICLR 2017, Toulon (2016)"},{"issue":"1","key":"305_CR20","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s11263-015-0823-z","volume":"116","author":"M Jaderberg","year":"2016","unstructured":"Jaderberg, M., Simonyan, K., Vedaldi, A., Zisserman, A.: Reading text in the wild with convolutional neural networks. Int J Comput Vis 116(1), 1\u201320 (2016)","journal-title":"Int J Comput Vis"},{"issue":"2\u20134","key":"305_CR21","doi-asserted-by":"publisher","first-page":"123","DOI":"10.1007\/s10032-006-0023-z","volume":"9","author":"L Likforman-Sulem","year":"2007","unstructured":"Likforman-Sulem, L., Zahour, A., Taconet, B.: Text line segmentation of historical documents: a survey. Int J Doc Anal Recognit 9(2\u20134), 123\u2013138 (2007)","journal-title":"Int J Doc Anal Recognit"},{"key":"305_CR22","doi-asserted-by":"crossref","unstructured":"Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C.Y., Berg, A.: Ssd: single shot multibox detector. In: European Conference on Computer Vision, Amsterdam (2016)","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"305_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Y., Jin, L.: Deep matching prior network: toward tighter multi-oriented text detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Honolulu, vol.\u00a02, p.\u00a08 (2017)","DOI":"10.1109\/CVPR.2017.368"},{"key":"305_CR24","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: IEEE Conference on Computer Vision and Pattern Recognition, Boston (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"issue":"2","key":"305_CR25","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"D Lowe","year":"2004","unstructured":"Lowe, D.: Distinctive image features from scale-invariant keypoints. Int. J. Comput. Vis. 60(2), 91\u2013110 (2004)","journal-title":"Int. J. Comput. Vis."},{"key":"305_CR26","doi-asserted-by":"crossref","unstructured":"Ma, J., Shao, W., Ye, H., Wang, L., Wang, H., Zheng, Y., Xue, X.: Arbitrary-oriented scene text detection via rotation proposals. IEEE Transactions on Multimedia (2018)","DOI":"10.1109\/TMM.2018.2818020"},{"issue":"5","key":"305_CR27","doi-asserted-by":"publisher","first-page":"791","DOI":"10.1016\/S0031-3203(98)00108-3","volume":"32","author":"S Messelodi","year":"1999","unstructured":"Messelodi, S., Modena, C.M.: Automatic identification and skew estimation of text lines in real scene images. Pattern Recogn. 32(5), 791\u2013810 (1999)","journal-title":"Pattern Recogn."},{"key":"305_CR28","doi-asserted-by":"crossref","unstructured":"Mordan, T., Thome, N., Cord, M., Henaff, G.: Deformable part-based fully convolutional network for object detection. In: British Machine Vision Conference (BMVC), London (2017)","DOI":"10.5244\/C.31.88"},{"issue":"1","key":"305_CR29","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1137\/0105003","volume":"5","author":"J Munkres","year":"1957","unstructured":"Munkres, J.: Algorithms for the assignment and transportation problems. J. Soc. Ind. Appl. Math. 5(1), 32\u201338 (1957)","journal-title":"J. Soc. Ind. Appl. Math."},{"key":"305_CR30","doi-asserted-by":"crossref","unstructured":"Nicolaou, A., Gatos, B.: Handwritten Text Line Segmentation by Shredding Text into its Lines. In: International Conference on Document Analysis and Recognition, Barcelona (2009)","DOI":"10.1109\/ICDAR.2009.243"},{"key":"305_CR31","doi-asserted-by":"crossref","unstructured":"Pham, V., Bluche, T., Kermorvant, C., Louradour, J.: Dropout improves recurrent neural networks for handwriting recognition. In: International Conference on Frontiers in Handwriting Recognition, Crete (2014)","DOI":"10.1109\/ICFHR.2014.55"},{"key":"305_CR32","doi-asserted-by":"crossref","unstructured":"Pinheiro, P., Collobert, R.: From image-level to pixel-level labeling with convolutional networks. In: IEEE Conference on Computer Vision and Pattern Recognition, Boston (2015)","DOI":"10.1109\/CVPR.2015.7298780"},{"key":"305_CR33","doi-asserted-by":"crossref","unstructured":"Pinheiro, P., Lin, T., Collobert, R., Dollar, P.: Learning to refine object segments. In: European Conference on Computer Vision, Amsterdam (2016)","DOI":"10.1007\/978-3-319-46448-0_5"},{"key":"305_CR34","doi-asserted-by":"crossref","unstructured":"Pletschacher, S., Clausner, C., Antonacopoulos, A.: Europeana newspapers ocr workflow evaluation. In: Workshop on Historical Document Imaging and Processing, Nancy (2015)","DOI":"10.1145\/2809544.2809554"},{"key":"305_CR35","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., Farhadi, A.: You only look once: Unified, real-time object detection. In: IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas (2016)","DOI":"10.1109\/CVPR.2016.91"},{"key":"305_CR36","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: Towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing System, Montreal (2015)"},{"issue":"3","key":"305_CR37","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., Berg, A.C., Fei-Fei, L.: ImageNet Large Scale Visual Recognition Challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015). https:\/\/doi.org\/10.1007\/s11263-015-0816-y","journal-title":"Int. J. Comput. Vis."},{"issue":"9","key":"305_CR38","doi-asserted-by":"publisher","first-page":"1115","DOI":"10.1109\/LSP.2014.2325940","volume":"21","author":"J Ryu","year":"2014","unstructured":"Ryu, J., Koo, H.I., Cho, N.I.: Language-independent text-line extraction algorithm for handwritten documents. Signal Process. Lett. 21(9), 1115\u20131119 (2014)","journal-title":"Signal Process. Lett."},{"key":"305_CR39","doi-asserted-by":"crossref","unstructured":"Shi, Z., Setlur, S., Govindaraju, V.: A Steerable Directional Local Profile Technique for Extraction of Handwritten Arabic Text Lines. In: International Conference on Document Analysis and Recognition, Barcelona (2009)","DOI":"10.1109\/ICDAR.2009.79"},{"key":"305_CR40","doi-asserted-by":"crossref","unstructured":"Stafylakis, T., Papavassiliou, V., Katsouros, V., Carayannis, G.: Robust text-line and word segmentation for handwritten documents images. In: IEEE International Conference on Acoustics, Speech and Signal Processing, Las Vegas, pp. 3393\u20133396. IEEE (2008)","DOI":"10.1109\/ICASSP.2008.4518379"},{"key":"305_CR41","doi-asserted-by":"crossref","unstructured":"Stewart, R., Ermon, S.: Label-free supervision of neural networks with physics and domain knowledge. In: AAAI, San Francisco, pp. 2576\u20132582 (2017)","DOI":"10.1609\/aaai.v31i1.10934"},{"key":"305_CR42","unstructured":"Szegedy, C., Reed, S., Erhan, D., Anguelov, D.: Scalable, high-quality object detection. arXiv:1412.1441 (2015)"},{"key":"305_CR43","unstructured":"Vinyals, O., Blundell, C., Lillicrap, T., Wierstra, D., et\u00a0al.: Matching networks for one shot learning. In: Advances in Neural Information Processing Systems, Barcelona (2016)"},{"issue":"4","key":"305_CR44","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1007\/s10032-006-0014-0","volume":"8","author":"C Wolf","year":"2006","unstructured":"Wolf, C., Jolion, J.M.: Object count\/area graphs for the evaluation of object detection and segmentation algorithms. Int. J. Doc. Anal. Recognit. 8(4), 280\u2013296 (2006)","journal-title":"Int. J. Doc. Anal. Recognit."},{"key":"305_CR45","unstructured":"Wonmin, W., Breuel, T., Raue, F., Liwicki, M.: Scene labeling with LSTM recurrent neural networks. In: IEEE Conference on Computer Vision and Pattern Recognition, Boston (2015)"},{"issue":"12","key":"305_CR46","doi-asserted-by":"publisher","first-page":"3146","DOI":"10.1016\/j.patcog.2008.12.013","volume":"42","author":"F Yin","year":"2009","unstructured":"Yin, F., Liu, C.L.: Handwritten chinese text line segmentation by clustering with distance metric learning. Pattern Recogn. 42(12), 3146\u20133157 (2009)","journal-title":"Pattern Recogn."},{"key":"305_CR47","unstructured":"Yu, F., Koltun, V.: Multi-scale context aggregation by dilated convolutions. In: International Conference on Learning Representations, San Juan (2016)"},{"key":"305_CR48","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Zhang, C., Shen, W., Yao, C., Liu, W., Bai, X.: Multi-oriented text detection with fully convolutional networks. In: IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas (2016)","DOI":"10.1109\/CVPR.2016.451"}],"container-title":["International Journal on Document Analysis and Recognition (IJDAR)"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10032-018-0305-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10032-018-0305-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10032-018-0305-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,25]],"date-time":"2022-08-25T10:19:09Z","timestamp":1661422749000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10032-018-0305-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,6,9]]},"references-count":48,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2018,9]]}},"alternative-id":["305"],"URL":"https:\/\/doi.org\/10.1007\/s10032-018-0305-2","relation":{},"ISSN":["1433-2833","1433-2825"],"issn-type":[{"type":"print","value":"1433-2833"},{"type":"electronic","value":"1433-2825"}],"subject":[],"published":{"date-parts":[[2018,6,9]]},"assertion":[{"value":"29 September 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 May 2018","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 May 2018","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 June 2018","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}