{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T09:06:21Z","timestamp":1781514381055,"version":"3.54.1"},"reference-count":57,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2009,9,9]],"date-time":"2009-09-09T00:00:00Z","timestamp":1252454400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2010,6]]},"DOI":"10.1007\/s11263-009-0275-4","type":"journal-article","created":{"date-parts":[[2009,9,8]],"date-time":"2009-09-08T17:15:27Z","timestamp":1252430127000},"page":"303-338","source":"Crossref","is-referenced-by-count":16120,"title":["The Pascal Visual Object Classes (VOC) Challenge"],"prefix":"10.1007","volume":"88","author":[{"given":"Mark","family":"Everingham","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Luc","family":"Van Gool","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Christopher K. I.","family":"Williams","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"John","family":"Winn","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andrew","family":"Zisserman","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2009,9,9]]},"reference":[{"key":"275_CR1","doi-asserted-by":"crossref","unstructured":"Bergtholdt, M., Kappes, J., & Schn\u00f6rr, C. (2006). Learning of graphical models and efficient inference for object class recognition. In Proceedings of the annual symposium of the German association for pattern recognition (DAGM06) (pp. 273\u2013283)","DOI":"10.1007\/11861898_28"},{"key":"275_CR2","doi-asserted-by":"crossref","unstructured":"Chum, O., & Zisserman, A. (2007). An exemplar model for learning object classes. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2007.383050"},{"key":"275_CR3","doi-asserted-by":"crossref","unstructured":"Chum, O., Philbin, J., Isard, M., & Zisserman, A. (2007). Scalable near identical image and shot detection. In Proceedings of the international conference on image and video retrieval (pp. 549\u2013556).","DOI":"10.1145\/1282280.1282359"},{"key":"275_CR4","unstructured":"Csurka, G., Bray, C., Dance, C., & Fan, L. (2004). Visual categorization with bags of keypoints. In Workshop on statistical learning in computer vision, ECCV (pp. 1\u201322)."},{"key":"275_CR5","doi-asserted-by":"crossref","unstructured":"Dalal, N., & Triggs, B. (2005). Histograms of oriented gradients for human detection. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 886\u2013893).","DOI":"10.1109\/CVPR.2005.177"},{"key":"275_CR6","first-page":"1","volume":"7","author":"J. Demsar","year":"2006","unstructured":"Demsar, J. (2006). Statistical comparisons of classifiers over multiple data sets. Journal of Machine Learning Research, 7, 1\u201330.","journal-title":"Journal of Machine Learning Research"},{"key":"275_CR7","doi-asserted-by":"crossref","unstructured":"Duygulu, P., Barnard, K., de Freitas, N., & Forsyth, D. A. (2002). Object recognition as machine translation: Learning a lexicon for a fixed image vocabulary. In Proceedings of the European conference on computer vision (pp. 97\u2013112).","DOI":"10.1007\/3-540-47979-1_7"},{"key":"275_CR8","series-title":"LNAI","doi-asserted-by":"crossref","first-page":"117","DOI":"10.1007\/11736790_8","volume-title":"Machine learning challenges\u2014evaluating predictive uncertainty, visual object classification, and recognising textual entailment","author":"M. Everingham","year":"2006","unstructured":"Everingham, M., Zisserman, A., Williams, C. K. I., & Van Gool, L. (2006a). The 2005 PASCAL visual object classes challenge. In LNAI: Vol.\u00a03944. Machine learning challenges\u2014evaluating predictive uncertainty, visual object classification, and recognising textual entailment (pp.\u00a0117\u2013176). Berlin: Springer."},{"key":"275_CR9","unstructured":"Everingham, M., Zisserman, A., Williams, C. K. I., & Van Gool,\u00a0L. (2006b). The PASCAL visual object classes challenge 2006 (VOC2006) results. http:\/\/pascal-network.org\/challenges\/VOC\/voc2006\/results.pdf ."},{"key":"275_CR10","unstructured":"Everingham, M., Van Gool, L., Williams, C. K. I., Winn, J., & Zisserman, A. (2007). The PASCAL visual object classes challenge 2007 (VOC2007) Results. http:\/\/www.pascal-network.org\/challenges\/VOC\/voc2007\/index.html ."},{"issue":"4","key":"275_CR11","doi-asserted-by":"crossref","first-page":"594","DOI":"10.1109\/TPAMI.2006.79","volume":"28","author":"L. Fei-Fei","year":"2006","unstructured":"Fei-Fei, L., Fergus, R., & Perona, P. (2006). One-shot learning of object categories. IEEE Transactions on Pattern Analysis and Machine Intelligence, 28(4), 594\u2013611. http:\/\/www.vision.caltech.edu\/Image_Datasets\/Caltech101\/Caltech101.html .","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"275_CR12","volume-title":"WordNet: an electronic lexical database","year":"1998","unstructured":"Fellbaum, C. (Ed.) (1998). WordNet: an electronic lexical database. Cambridge: MIT Press."},{"key":"275_CR13","doi-asserted-by":"crossref","unstructured":"Felzenszwalb, P., McAllester, D., & Ramanan, D. (2008). A discriminatively trained, multiscale, deformable part model. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2008.4587597"},{"key":"275_CR14","doi-asserted-by":"crossref","unstructured":"Fergus, R., Fei-Fei, L., Perona, P., & Zisserman, A. (2005). Learning object categories from Google\u2019s image search. In Proceedings of the international conference on computer vision.","DOI":"10.1109\/ICCV.2005.142"},{"issue":"3","key":"275_CR15","doi-asserted-by":"crossref","first-page":"273","DOI":"10.1007\/s11263-006-8707-x","volume":"71","author":"R. Fergus","year":"2007","unstructured":"Fergus, R., Perona, P., & Zisserman, A. (2007). Weakly supervised scale-invariant learning of models for visual recognition. International Journal of Computer Vision, 71(3), 273\u2013303.","journal-title":"International Journal of Computer Vision"},{"issue":"1","key":"275_CR16","doi-asserted-by":"crossref","first-page":"36","DOI":"10.1109\/TPAMI.2007.1144","volume":"30","author":"V. Ferrari","year":"2008","unstructured":"Ferrari, V., Fevrier, L., Jurie, F., & Schmid, C. (2008). Groups of adjacent contour segments for object detection. IEEE Transactions on Pattern Analysis and Machine Intelligence, 30(1), 36\u201351.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"275_CR17","doi-asserted-by":"crossref","unstructured":"Fritz, M., & Schiele, B. (2008). Decomposition, discovery and detection of visual categories using topic models. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2008.4587803"},{"key":"275_CR18","doi-asserted-by":"crossref","unstructured":"Geusebroek, J. (2006). Compact object descriptors from local colour invariant histograms. In Proceedings of the British machine vision conference (pp. 1029\u20131038).","DOI":"10.5244\/C.20.105"},{"key":"275_CR19","doi-asserted-by":"crossref","unstructured":"Grauman, K., & Darrell, T. (2005). The pyramid match kernel: Discriminative classification with sets of image features. In Proceedings of the international conference on computer vision (pp.\u00a01458\u20131465).","DOI":"10.1109\/ICCV.2005.239"},{"key":"275_CR20","unstructured":"Griffin, G., Holub, A., & Perona, P. (2007). Caltech-256 object category dataset (Technical Report 7694). California Institute of Technology. http:\/\/www.vision.caltech.edu\/Image_Datasets\/Caltech256\/ ."},{"key":"275_CR21","doi-asserted-by":"crossref","unstructured":"Hoiem, D., Efros, A. A., & Hebert, M. (2006). Putting objects in perspective. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2137\u20132144).","DOI":"10.1109\/CVPR.2006.232"},{"key":"275_CR22","doi-asserted-by":"crossref","unstructured":"Kohli, P., Ladicky, L., & Torr, P. (2008). Robust higher order potentials for enforcing label consistency. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2008.4587417"},{"key":"275_CR23","doi-asserted-by":"crossref","unstructured":"Lampert, C. H., Blaschko, M. B., & Hofmann, T. (2008). Beyond sliding windows: Object localization by efficient subwindow search. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2008.4587586"},{"key":"275_CR24","doi-asserted-by":"crossref","unstructured":"Laptev, I. (2006). Improvements of object detection using boosted histograms. In Proceedings of the British machine vision conference (pp. 949\u2013958).","DOI":"10.5244\/C.20.97"},{"key":"275_CR25","doi-asserted-by":"crossref","unstructured":"Lazebnik, S., Schmid, C., & Ponce, J. (2006). Beyond bags of features: Spatial pyramid matching for recognizing natural scene categories. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2169\u20132178).","DOI":"10.1109\/CVPR.2006.68"},{"key":"275_CR26","unstructured":"Leibe, B., Leonardis, A., & Schiele, B. (2004). Combined object categorization and segmentation with an implicit shape model. In ECCV2004 workshop on statistical learning in computer vision, Prague, Czech Republic (pp. 17\u201332)."},{"key":"275_CR27","doi-asserted-by":"crossref","unstructured":"Liu, X., Wang, D., Li, J., & Zhang, B. (2007). The feature and spatial covariant kernel: Adding implicit spatial constraints to histogram. In Proceedings of the international conference on image and video retrieval.","DOI":"10.1145\/1282280.1282361"},{"issue":"2","key":"275_CR28","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"D. Lowe","year":"2004","unstructured":"Lowe, D. (2004). Distinctive image features from scale-invariant keypoints. International Journal of Computer Vision, 60(2), 91\u2013110.","journal-title":"International Journal of Computer Vision"},{"key":"275_CR29","doi-asserted-by":"crossref","unstructured":"Marszalek, M., & Schmid, C. (2007). Semantic hierarchies for visual object recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2007.383272"},{"key":"275_CR30","doi-asserted-by":"crossref","unstructured":"Perronnin, F., & Dance, C. (2007). Fisher kernels on visual vocabularies for image categorization. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2007.383266"},{"issue":"1","key":"275_CR31","doi-asserted-by":"crossref","first-page":"151","DOI":"10.1371\/journal.pcbi.0040027","volume":"4","author":"N. Pinto","year":"2008","unstructured":"Pinto, N., Cox, D., & DiCarlo, J. (2008). Why is real-world visual object recognition hard? PLoS Computational Biology, 4(1), 151\u2013156.","journal-title":"PLoS Computational Biology"},{"issue":"1\u20133","key":"275_CR32","doi-asserted-by":"crossref","first-page":"157","DOI":"10.1007\/s11263-007-0090-8","volume":"77","author":"B. Russell","year":"2008","unstructured":"Russell, B., Torralba, A., Murphy, K., & Freeman, W. T. (2008). LabelMe: a database and web-based tool for image annotation. International Journal of Computer Vision, 77(1\u20133), 157\u2013173. http:\/\/labelme.csail.mit.edu\/ .","journal-title":"International Journal of Computer Vision"},{"key":"275_CR33","volume-title":"Introduction to modern information retrieval","author":"G. Salton","year":"1986","unstructured":"Salton, G., & McGill, M. J. (1986). Introduction to modern information retrieval. New York: McGraw-Hill."},{"issue":"1\u20133","key":"275_CR34","doi-asserted-by":"crossref","first-page":"7","DOI":"10.1023\/A:1014573219977","volume":"47","author":"D. Scharstein","year":"2002","unstructured":"Scharstein, D., & Szeliski, R. (2002). A taxonomy and evaluation of dense two-frame stereo correspondence algorithms. International Journal of Computer Vision, 47(1\u20133), 7\u201342. http:\/\/vision.middlebury.edu\/stereo\/ .","journal-title":"International Journal of Computer Vision"},{"key":"275_CR35","doi-asserted-by":"crossref","unstructured":"Shotton, J., Winn, J. M., Rother, C., & Criminisi, A. (2006). TextonBoost: Joint appearance, shape and context modeling for multi-class object recognition and segmentation. In Proceedings of the European conference on computer vision (pp. 1\u201315).","DOI":"10.1007\/11744023_1"},{"key":"275_CR36","unstructured":"Sivic, J., & Zisserman, A. (2003). Video Google: A text retrieval approach to object matching in videos. In Proceedings of the international conference on computer vision (Vol. 2, pp. 1470\u20131477). http:\/\/www.robots.ox.ac.uk\/~vgg ."},{"key":"275_CR37","doi-asserted-by":"crossref","unstructured":"Smeaton, A. F., Over, P., & Kraaij, W. (2006). Evaluation campaigns and TRECVID. In MIR \u201906: Proceedings of the 8th ACM international workshop on multimedia information retrieval (pp. 321\u2013330).","DOI":"10.1145\/1178677.1178722"},{"key":"275_CR38","doi-asserted-by":"crossref","unstructured":"Snoek, C., Worring, M., & Smeulders, A. (2005). Early versus late fusion in semantic video analysis. In Proceedings of the ACM international conference on multimedia (pp. 399\u2013402).","DOI":"10.1145\/1101149.1101236"},{"key":"275_CR39","doi-asserted-by":"crossref","unstructured":"Snoek, C., Worring, M., van Gemert, J., Geusebroek, J., & Smeulders, A. (2006). The challenge problem for automated detection of 101 semantic concepts in multimedia. In Proceedings of ACM multimedia.","DOI":"10.1145\/1180639.1180727"},{"key":"275_CR40","doi-asserted-by":"crossref","unstructured":"Sorokin, A., & Forsyth, D. (2008). Utility data annotation with Amazon mechanical turk. In Proceedings of the first IEEE workshop on Internet vision (at CVPR 2008).","DOI":"10.1109\/CVPRW.2008.4562953"},{"key":"275_CR41","doi-asserted-by":"crossref","unstructured":"Spain, M., & Perona, P. (2008). Some objects are more equal than others: Measuring and predicting importance. In Proceedings of the European conference on computer vision (pp. 523\u2013536).","DOI":"10.1007\/978-3-540-88682-2_40"},{"key":"275_CR42","doi-asserted-by":"crossref","unstructured":"Stoettinger, J., Hanbury, A., Sebe, N., & Gevers, T. (2007). Do colour interest points improve image retrieval? In Proceedings of the IEEE international conference on image processing (pp. 169\u2013172).","DOI":"10.1109\/ICIP.2007.4378918"},{"issue":"1\u20133","key":"275_CR43","doi-asserted-by":"crossref","first-page":"291","DOI":"10.1007\/s11263-007-0069-5","volume":"77","author":"E. B. Sudderth","year":"2008","unstructured":"Sudderth, E. B., Torralba, A. B., Freeman, W. T., & Willsky, A. S. (2008). Describing visual scenes using transformed objects and parts. International Journal of Computer Vision, 77(1\u20133), 291\u2013330.","journal-title":"International Journal of Computer Vision"},{"issue":"2","key":"275_CR44","doi-asserted-by":"crossref","first-page":"169","DOI":"10.1023\/A:1023052124951","volume":"53","author":"A. B. Torralba","year":"2003","unstructured":"Torralba, A. B. (2003). Contextual priming for object detection. International Journal of Computer Vision, 53(2), 169\u2013191.","journal-title":"International Journal of Computer Vision"},{"issue":"5","key":"275_CR45","doi-asserted-by":"crossref","first-page":"854","DOI":"10.1109\/TPAMI.2007.1055","volume":"29","author":"A. B. Torralba","year":"2007","unstructured":"Torralba, A. B., Murphy, K. P., & Freeman, W. T. (2007). Sharing visual features for multiclass and multiview object detection. IEEE Transactions on Pattern Analysis and Machine Intelligence, 29(5), 854\u2013869.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"275_CR46","doi-asserted-by":"crossref","unstructured":"van de Sande, K. E. A., Gevers, T., & Snoek, C. G. M. (2008). Evaluation of color descriptors for object and scene recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2008.4587658"},{"key":"275_CR47","doi-asserted-by":"crossref","unstructured":"van de Weijer, J., & Schmid, C. (2006). Coloring local feature extraction. In Proceedings of the European conference on computer vision.","DOI":"10.1007\/11744047_26"},{"key":"275_CR48","doi-asserted-by":"crossref","unstructured":"van Gemert, J., Geusebroek, J., Veenman, C., Snoek, C., & Smeulders, A. (2006). Robust scene categorization by learning image statistics in context. In CVPR workshop on semantic learning applications in multimedia.","DOI":"10.1109\/CVPRW.2006.177"},{"key":"275_CR49","unstructured":"Viitaniemi, V., & Laaksonen, J. (2008). Evaluation of techniques for image classification, object detection and object segmentation (Technical Report TKK-ICS-R2). Department of Information and Computer Science, Helsinki University of Technology. http:\/\/www.cis.hut.fi\/projects\/cbir\/ ."},{"issue":"2","key":"275_CR50","doi-asserted-by":"crossref","first-page":"137","DOI":"10.1023\/B:VISI.0000013087.49260.fb","volume":"57","author":"P. A. Viola","year":"2004","unstructured":"Viola, P. A., & Jones, M. J. (2004). Robust Real-time Face Detection. International Journal of Computer Vision, 57(2), 137\u2013154.","journal-title":"International Journal of Computer Vision"},{"key":"275_CR51","doi-asserted-by":"crossref","unstructured":"von Ahn, L., & Dabbish, L. (2004). Labeling images with a computer game. In Proceedings of the ACM CHI (pp. 319\u2013326).","DOI":"10.1145\/985692.985733"},{"key":"275_CR52","doi-asserted-by":"crossref","unstructured":"Wang, D., Li, J., & Zhang, B. (2006). Relay boost fusion for learning rare concepts in multimedia. In Proceedings of the international conference on image and video retrieval.","DOI":"10.1007\/11788034_28"},{"key":"275_CR53","unstructured":"Winn, J., & Everingham, M. (2007). The PASCAL visual object classes challenge 2007 (VOC2007) annotation guidelines. http:\/\/pascallin.ecs.soton.ac.uk\/challenges\/VOC\/voc2007\/guidelines.html ."},{"key":"275_CR54","unstructured":"Yao, B., Yang, X., & Zhu, S. C. (2007). Introduction to a large scale general purpose ground truth dataset: methodology, annotation tool, and benchmarks. In Proceedings of the 6th international conference on energy minimization methods in computer vision and pattern recognition. http:\/\/www.imageparsing.com\/ ."},{"key":"275_CR55","doi-asserted-by":"crossref","unstructured":"Yilmaz, E., & Aslam, J. (2006). Estimating average precision with incomplete and imperfect judgments. In Fifteenth ACM international conference on information and knowledge management (CIKM).","DOI":"10.1145\/1183614.1183633"},{"key":"275_CR56","doi-asserted-by":"crossref","unstructured":"Zehnder, P., Koller-Meier, E., & Van Gool, L. (2008). An efficient multi-class detection cascade. In Proceedings of the British machine vision conference.","DOI":"10.5244\/C.22.80"},{"issue":"2","key":"275_CR57","doi-asserted-by":"crossref","first-page":"213","DOI":"10.1007\/s11263-006-9794-4","volume":"73","author":"J. Zhang","year":"2007","unstructured":"Zhang, J., Marszalek, M., Lazebnik, S., & Schmid, C. (2007). Local features and kernels for classification of texture and object categories: A comprehensive study. International Journal of Computer Vision, 73(2), 213\u2013238.","journal-title":"International Journal of Computer Vision"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-009-0275-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-009-0275-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-009-0275-4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,6,1]],"date-time":"2019-06-01T12:16:42Z","timestamp":1559391402000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-009-0275-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2009,9,9]]},"references-count":57,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2010,6]]}},"alternative-id":["275"],"URL":"https:\/\/doi.org\/10.1007\/s11263-009-0275-4","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2009,9,9]]}}}