{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T10:21:28Z","timestamp":1776075688146,"version":"3.50.1"},"publisher-location":"Cham","reference-count":33,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319105772","type":"print"},{"value":"9783319105789","type":"electronic"}],"license":[{"start":{"date-parts":[[2014,1,1]],"date-time":"2014-01-01T00:00:00Z","timestamp":1388534400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2014]]},"DOI":"10.1007\/978-3-319-10578-9_23","type":"book-chapter","created":{"date-parts":[[2014,8,13]],"date-time":"2014-08-13T20:51:57Z","timestamp":1407963117000},"page":"346-361","source":"Crossref","is-referenced-by-count":1419,"title":["Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition"],"prefix":"10.1007","author":[{"given":"Kaiming","family":"He","sequence":"first","affiliation":[]},{"given":"Xiangyu","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Shaoqing","family":"Ren","sequence":"additional","affiliation":[]},{"given":"Jian","family":"Sun","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"23_CR1","doi-asserted-by":"crossref","unstructured":"Chang, C.C., Lin, C.J.: Libsvm: A library for support vector machines. ACM Transactions on Intelligent Systems and Technology, TIST (2011)","DOI":"10.1145\/1961189.1961199"},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Chatfield, K., Lempitsky, V., Vedaldi, A., Zisserman, A.: The devil is in the details: An evaluation of recent feature encoding methods. In: BMVC (2011)","DOI":"10.5244\/C.25.76"},{"key":"23_CR3","doi-asserted-by":"crossref","unstructured":"Cheng, M.M., Zhang, Z., Lin, W.Y., Torr, P.: BING: Binarized normed gradients for objectness estimation at 300fps. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.414"},{"key":"23_CR4","unstructured":"Coates, A., Ng, A.: The importance of encoding versus training with sparse coding and vector quantization. In: ICML (2011)"},{"key":"23_CR5","unstructured":"Dalal, N., Triggs, B.: Histograms of oriented gradients for human detection. In: CVPR (2005)"},{"key":"23_CR6","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: A large-scale hierarchical image database. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"23_CR7","unstructured":"Donahue, J., Jia, Y., Vinyals, O., Hoffman, J., Zhang, N., Tzeng, E., Darrell, T.: Decaf: A deep convolutional activation feature for generic visual recognition. ArXiv:1310.1531 (2013)"},{"key":"23_CR8","unstructured":"Everingham, M., Van Gool, L., Williams, C.K.I., Winn, J., Zisserman, A.: The PASCAL Visual Object Classes Challenge, VOC 2007 Results (2007)"},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"Fei-Fei, L., Fergus, R., Perona, P.: Learning generative visual models from few training examples: An incremental bayesian approach tested on 101 object categories. CVIU (2007)","DOI":"10.1016\/j.cviu.2005.09.012"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Felzenszwalb, P.F., Girshick, R.B., McAllester, D., Ramanan, D.: Object detection with discriminatively trained part-based models. PAMI (2010)","DOI":"10.1109\/TPAMI.2009.167"},{"key":"23_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"696","DOI":"10.1007\/978-3-540-88690-7_52","volume-title":"Computer Vision \u2013 ECCV 2008","author":"J.C. Gemert van","year":"2008","unstructured":"van Gemert, J.C., Geusebroek, J.-M., Veenman, C.J., Smeulders, A.W.M.: Kernel codebooks for scene categorization. In: Forsyth, D., Torr, P., Zisserman, A. (eds.) ECCV 2008, Part III. LNCS, vol.\u00a05304, pp. 696\u2013709. Springer, Heidelberg (2008)"},{"key":"23_CR12","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.81"},{"key":"23_CR13","doi-asserted-by":"crossref","unstructured":"Gong, Y., Wang, L., Guo, R., Lazebnik, S.: Multi-scale orderless pooling of deep convolutional activation features. ArXiv:1403.1840 (2014)","DOI":"10.1007\/978-3-319-10584-0_26"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Grauman, K., Darrell, T.: The pyramid match kernel: Discriminative classification with sets of image features. In: ICCV (2005)","DOI":"10.1109\/ICCV.2005.239"},{"key":"23_CR15","unstructured":"Howard, A.G.: Some improvements on deep convolutional neural network based image classification. ArXiv:1312.5402 (2013)"},{"key":"23_CR16","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.: Imagenet classification with deep convolutional neural networks. In: NIPS (2012)"},{"key":"23_CR17","unstructured":"Lazebnik, S., Schmid, C., Ponce, J.: Beyond bags of features: Spatial pyramid matching for recognizing natural scene categories. In: CVPR (2006)"},{"key":"23_CR18","doi-asserted-by":"crossref","unstructured":"LeCun, Y., Boser, B., Denker, J.S., Henderson, D., Howard, R.E., Hubbard, W., Jackel, L.D.: Backpropagation applied to handwritten zip code recognition. Neural Computation (1989)","DOI":"10.1162\/neco.1989.1.4.541"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Lowe, D.G.: Distinctive image features from scale-invariant keypoints. IJCV (2004)","DOI":"10.1023\/B:VISI.0000029664.99615.94"},{"key":"23_CR20","doi-asserted-by":"crossref","unstructured":"Oquab, M., Bottou, L., Laptev, I., Sivic, J., et al.: Learning and transferring mid-level image representations using convolutional neural networks. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.222"},{"key":"23_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"143","DOI":"10.1007\/978-3-642-15561-1_11","volume-title":"Computer Vision \u2013 ECCV 2010","author":"F. Perronnin","year":"2010","unstructured":"Perronnin, F., S\u00e1nchez, J., Mensink, T.: Improving the fisher kernel for large-scale image classification. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010, Part IV. LNCS, vol.\u00a06314, pp. 143\u2013156. Springer, Heidelberg (2010)"},{"key":"23_CR22","doi-asserted-by":"crossref","unstructured":"Razavian, A.S., Azizpour, H., Sullivan, J., Carlsson, S.: Cnn features off-the-shelf: An astounding baseline for recogniton. In: CVPR 2014, DeepVision Workshop (2014)","DOI":"10.1109\/CVPRW.2014.131"},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"van de Sande, K.E., Uijlings, J.R., Gevers, T., Smeulders, A.W.: Segmentation as selective search for object recognition. In: ICCV (2011)","DOI":"10.1109\/ICCV.2011.6126456"},{"key":"23_CR24","unstructured":"Sermanet, P., Eigen, D., Zhang, X., Mathieu, M., Fergus, R., LeCun, Y.: Overfeat: Integrated recognition, localization and detection using convolutional networks. ArXiv:1312.6229 (2013)"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Sivic, J., Zisserman, A.: Video google: A text retrieval approach to object matching in videos. In: ICCV (2003)","DOI":"10.1109\/ICCV.2003.1238663"},{"key":"23_CR26","unstructured":"Szegedy, C., Toshev, A., Erhan, D.: Deep neural networks for object detection. In: NIPS (2013)"},{"key":"23_CR27","doi-asserted-by":"crossref","unstructured":"Taigman, Y., Yang, M., Ranzato, M., Wolf, L.: Deepface: Closing the gap to human-level performance in face verification. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.220"},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., Gong, Y.: Locality-constrained linear coding for image classification. In: CVPR (2010)","DOI":"10.1109\/CVPR.2010.5540018"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Wang, X., Yang, M., Zhu, S., Lin, Y.: Regionlets for generic object detection. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.10"},{"key":"23_CR30","unstructured":"Yang, J., Yu, K., Gong, Y., Huang, T.: Linear spatial pyramid matching using sparse coding for image classification. In: CVPR (2009)"},{"key":"23_CR31","unstructured":"Zeiler, M.D., Fergus, R.: Visualizing and understanding convolutional neural networks. ArXiv:1311.2901 (2013)"},{"key":"23_CR32","doi-asserted-by":"crossref","unstructured":"Zhang, N., Paluri, M., Ranzato, M., Darrell, T., Bourdevr, L.: Panda: Pose aligned networks for deep attribute modeling. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.212"},{"key":"23_CR33","doi-asserted-by":"crossref","unstructured":"Zou, W.Y., Wang, X., Sun, M., Lin, Y.: Generic object detection with dense neural patterns and regionlets. ArXiv:1404.4316 (2014)","DOI":"10.5244\/C.28.72"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2014"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-10578-9_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,12,2]],"date-time":"2019-12-02T10:10:13Z","timestamp":1575281413000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-10578-9_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014]]},"ISBN":["9783319105772","9783319105789"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-10578-9_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2014]]}}}