{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T22:20:29Z","timestamp":1766269229971,"version":"3.40.3"},"publisher-location":"Cham","reference-count":54,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319466033"},{"type":"electronic","value":"9783319466040"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-46604-0_47","type":"book-chapter","created":{"date-parts":[[2016,9,17]],"date-time":"2016-09-17T03:31:55Z","timestamp":1474083115000},"page":"668-684","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":18,"title":["Depth2Action: Exploring Embedded Depth for Large-Scale Action Recognition"],"prefix":"10.1007","author":[{"given":"Yi","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Shawn","family":"Newsam","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,9,18]]},"reference":[{"key":"47_CR1","doi-asserted-by":"crossref","unstructured":"Baig, M.H., Torresani, L.: Coupled Depth Learning. In: WACV (2016)","DOI":"10.1109\/WACV.2016.7477699"},{"key":"47_CR2","unstructured":"Ballas, N., Yao, L., Pal, C., Courville, A.: Delving deeper into convolutional networks for learning video representations. In: ICLR (2016)"},{"key":"47_CR3","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1007\/s11554-013-0370-1","volume":"12","author":"C Chen","year":"2013","unstructured":"Chen, C., Liu, K., Kehtarnavaz, N.: Real-time human action recognition based on depth motion maps. J. Real-Time Image Process. 12, 155\u2013163 (2013)","journal-title":"J. Real-Time Image Process."},{"key":"47_CR4","doi-asserted-by":"crossref","unstructured":"Eigen, D., Fergus, R.: Predicting depth, surface normals and semantic labels with a common multi-scale convolutional architecture. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.304"},{"key":"47_CR5","first-page":"1871","volume":"9","author":"RE Fan","year":"2008","unstructured":"Fan, R.E., Chang, K.W., Hsieh, C.J., Wang, X.R., Lin, C.J.: LIBLINEAR: a library for large linear classification. J. Mach. Learn. Res. 9, 1871\u20131874 (2008)","journal-title":"J. Mach. Learn. Res."},{"key":"47_CR6","doi-asserted-by":"crossref","unstructured":"Fernando, B., Gavves, E., M., J.O., Ghodrati, A., Tuytelaars, T.: Modeling video evolution for action recognition. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299176"},{"key":"47_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"346","DOI":"10.1007\/978-3-319-10578-9_23","volume-title":"Computer Vision \u2013 ECCV 2014","author":"K He","year":"2014","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Spatial pyramid pooling in deep convolutional networks for visual recognition. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8691, pp. 346\u2013361. Springer, Heidelberg (2014). doi:\n                      10.1007\/978-3-319-10578-9_23"},{"key":"47_CR8","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Escorcia, V., Ghanem, B., Niebles, J.C.: ActivityNet: a large-scale video benchmark for human activity understanding. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"47_CR9","doi-asserted-by":"crossref","unstructured":"Jain, M., van Gemert, J.C., Snoek, C.G.M.: What do 15,000 object categories tell us about classifying and localizing actions? In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298599"},{"key":"47_CR10","doi-asserted-by":"publisher","first-page":"1704","DOI":"10.1109\/TPAMI.2011.235","volume":"34","author":"H Jegou","year":"2012","unstructured":"Jegou, H., Perronnin, F., Douze, M., Sanchez, J., Perez, P., Schmid, C.: Aggregating local image descriptors into compact codes. TPAMI 34, 1704\u20131716 (2012)","journal-title":"TPAMI"},{"key":"47_CR11","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2012","unstructured":"Ji, S., Xu, W., Yang, M., Yu, K.: 3D convolutional neural networks for human action recognition. TPAMI 35, 221\u2013231 (2012)","journal-title":"TPAMI"},{"key":"47_CR12","doi-asserted-by":"crossref","unstructured":"Jia, Y., Shelhamer, E., Donahue, J., Karayev, S., Long, J., Girshick, R., Guadarrama, S., Darrell, T.: Caffe: convolutional architecture for fast feature embedding. arXiv preprint \n                      arXiv:1408.5093\n                      \n                     (2014)","DOI":"10.1145\/2647868.2654889"},{"key":"47_CR13","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"47_CR14","doi-asserted-by":"crossref","unstructured":"Kong, N., Black, M.J.: Intrinsic depth: improving depth transfer with intrinsic images. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.401"},{"key":"47_CR15","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: NIPS (2012)"},{"key":"47_CR16","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: ICCV (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"47_CR17","unstructured":"Lan, Z., Lin, M., Li, X., Hauptmann, A.G., Raj, B.: Beyond gaussian pyramid: multi-skip feature stacking for action recognition. In: CVPR (2015)"},{"key":"47_CR18","unstructured":"Lan, Z., Yu, S.I., Hauptmann, A.G.: Improving human activity recognition through ranking and re-ranking. arXiv preprint \n                      arXiv:1512.03740\n                      \n                     (2015)"},{"key":"47_CR19","doi-asserted-by":"crossref","unstructured":"Li, W., Zhang, Z., Liu, Z.: Action recognition based on a bag of 3D points. In: CVPR (2010)","DOI":"10.1109\/CVPRW.2010.5543273"},{"key":"47_CR20","doi-asserted-by":"crossref","unstructured":"Liu, F., Shen, C., Lin, G.: Deep convolutional neural fields for depth estimation from a single image. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299152"},{"key":"47_CR21","doi-asserted-by":"crossref","unstructured":"Liu, L., Zhou, Y., Shao, L.: DAP3D-Net: where, what and how actions occur in videos? arXiv preprint \n                      arXiv:1602.03346\n                      \n                     (2016)","DOI":"10.1109\/ICRA.2017.7989018"},{"key":"47_CR22","unstructured":"Liu, M., Salzmann, M., He, X.: Structured depth prediction in challenging monocular video sequences. arXiv preprint \n                      arXiv:1511.06070\n                      \n                     (2015)"},{"key":"47_CR23","unstructured":"Ma, S., Bargal, S.A., Zhang, J., Sigal, L., Sclaroff, S.: Do less and achieve more: training CNNs for action recognition utilizing action images from the web. arXiv preprint \n                      arXiv:1512.07155\n                      \n                     (2015)"},{"key":"47_CR24","unstructured":"Ng, J.Y.H., Hausknecht, M., Vijayanarasimhan, S., Vinyals, O., Monga, R., Toderici, G.: Beyond short snippets: deep networks for video classification. In: CVPR (2015)"},{"key":"47_CR25","doi-asserted-by":"crossref","unstructured":"Oneata, D., Verbeek, J., Schmid, C.: Action and event recognition with fisher vectors on a compact feature Set. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.228"},{"key":"47_CR26","unstructured":"Oneata, D., Verbeek, J., Schmid, C.: The LEAR submission at THUMOS 2014 (2014)"},{"key":"47_CR27","doi-asserted-by":"crossref","unstructured":"Oreifej, O., Liu, Z.: HON4D: histogram of oriented 4D normals for activity recognition from depth sequences. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.98"},{"key":"47_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"581","DOI":"10.1007\/978-3-319-10602-1_38","volume-title":"Computer Vision \u2013 ECCV 2014","author":"X Peng","year":"2014","unstructured":"Peng, X., Zou, C., Qiao, Y., Peng, Q.: Action recognition with stacked fisher vectors. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 581\u2013595. Springer, Heidelberg (2014). doi:\n                      10.1007\/978-3-319-10602-1_38"},{"key":"47_CR29","doi-asserted-by":"crossref","unstructured":"Raza, S.H., Javed, O., Das, A., Sawhney, H., Cheng, H., Essa, I.: Depth extraction from videos using geometric context and occlusion boundaries. In: BMVC (2014)","DOI":"10.5244\/C.28.10"},{"key":"47_CR30","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., Berg, A.C., Fei-Fei, L.: Imagenet large scale visual recognition challenge. IJCV 115, 211\u2013252 (2015)","journal-title":"IJCV"},{"key":"47_CR31","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: NIPS (2014)"},{"key":"47_CR32","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: ICLR (2015)"},{"key":"47_CR33","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human action classes from videos in the wild. In: CRCV-TR-12-01 (2012)"},{"key":"47_CR34","doi-asserted-by":"crossref","unstructured":"Sourimant, G.: A simple and efficient way to compute depth maps for multi-view videos. In: 3DTV-Conference (2010)","DOI":"10.1109\/3DTV.2010.5506333"},{"key":"47_CR35","unstructured":"Srivastava, N., Mansimov, E., Salakhutdinov, R.: Unsupervised learning of video representations using LSTMs. In: ICML (2015)"},{"key":"47_CR36","doi-asserted-by":"crossref","unstructured":"Sun, L., Jia, K., Yeung, D.Y., Shi, B.E.: Human action recognition using factorized spatio-temporal convolutional networks. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.522"},{"key":"47_CR37","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"47_CR38","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1016\/j.patrec.2013.07.011","volume":"36","author":"AW Vieiraa","year":"2014","unstructured":"Vieiraa, A.W., Nascimentoa, E.R., Oliveiraa, G.L., Liuc, Z., Campos, M.F.: On the improvement of human action recognition from depth map sequences using space time occupancy patterns. Pattern Recogn. Lett. 36, 221\u2013227 (2014)","journal-title":"Pattern Recogn. Lett."},{"key":"47_CR39","doi-asserted-by":"crossref","unstructured":"Wang, H., Schmid, C.: Action recognition with improved trajectories. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.441"},{"key":"47_CR40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-04561-0","volume-title":"Human Action Recognition with Depth Cameras","author":"J Wang","year":"2014","unstructured":"Wang, J., Liu, Z., Wu, Y.: Human Action Recognition with Depth Cameras. Springer, Heidelberg (2014)"},{"key":"47_CR41","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"872","DOI":"10.1007\/978-3-642-33709-3_62","volume-title":"Computer Vision \u2013 ECCV 2012","author":"J Wang","year":"2012","unstructured":"Wang, J., Liu, Z., Chorowski, J., Chen, Z., Wu, Y.: Robust 3D action recognition with random occupancy patterns. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7573, pp. 872\u2013885. Springer, Heidelberg (2012)"},{"key":"47_CR42","unstructured":"Wang, J., Liu, Z., Wu, Y., Yuan, J.: Mining actionlet ensemble for action recognition with depth cameras. In: CVPR (2012)"},{"key":"47_CR43","doi-asserted-by":"crossref","unstructured":"Wang, L., Qiao, Y., Tang, X.: Action recognition with trajectory-pooled deep-convolutional descriptors. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"47_CR44","unstructured":"Wang, L., Xiong, Y., Wang, Z., Qiao, Y.: Towards good practices for very deep two-stream convNets. arXiv preprint \n                      arXiv:1507.02159\n                      \n                     (2015)"},{"key":"47_CR45","doi-asserted-by":"crossref","unstructured":"Wang, P., Li, W., Gao, Z., Tang, C., Zhang, J., Ogunbona, P.: Convnets-based action recognition from depth maps through virtual cameras and pseudocoloring. In: ACM MM (2015)","DOI":"10.1145\/2733373.2806296"},{"key":"47_CR46","doi-asserted-by":"crossref","unstructured":"Wang, X., Farhadi, A., Gupta, A.: Actions transformations. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.291"},{"key":"47_CR47","doi-asserted-by":"crossref","unstructured":"Wu, Z., Wang, X., Jiang, Y.G., Ye, H., Xue, X.: Modeling spatial-temporal clues in a hybrid deep learning framework for video classification. In: ACM MM (2015)","DOI":"10.1145\/2733373.2806222"},{"key":"47_CR48","doi-asserted-by":"crossref","unstructured":"Xu, Z., Yang, Y., Hauptmann, A.G.: A discriminative CNN video representation for event detection. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298789"},{"key":"47_CR49","doi-asserted-by":"crossref","unstructured":"Yang, X., Zhang, C., Tian, Y.: Recognizing actions using depth motion maps-based histograms of oriented gradients. In: ACM MM (2012)","DOI":"10.1145\/2393347.2396382"},{"key":"47_CR50","doi-asserted-by":"crossref","unstructured":"Yang, X., Tian, Y.: Super normal vector for activity recognition using depth sequences. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.108"},{"key":"47_CR51","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1007\/978-3-319-16814-2_4","volume-title":"Computer Vision \u2013 ACCV 2014","author":"G Yu","year":"2015","unstructured":"Yu, G., Liu, Z., Yuan, J.: Discriminative orderlet mining for real-time recognition of human-object interaction. In: Cremers, D., Reid, I., Saito, H., Yang, M.-H. (eds.) ACCV 2014. LNCS, vol. 9007, pp. 50\u201365. Springer, Heidelberg (2015). doi:\n                      10.1007\/978-3-319-16814-2_4"},{"key":"47_CR52","doi-asserted-by":"crossref","unstructured":"Zha, S., Luisier, F., Andrews, W., Srivastava, N., Salakhutdinov, R.: Exploiting image-trained CNN architectures for unconstrained video classification. In: BMVC (2015)","DOI":"10.5244\/C.29.60"},{"key":"47_CR53","doi-asserted-by":"publisher","first-page":"974","DOI":"10.1109\/TPAMI.2009.52","volume":"31","author":"G Zhang","year":"2009","unstructured":"Zhang, G., Jia, J., Wong, T.T., Bao, H.: Consistent depth maps recovery from a video sequence. TPAMI 31, 974\u2013988 (2009)","journal-title":"TPAMI"},{"key":"47_CR54","unstructured":"Zhao, S., Liu, Y., Han, Y., Hong, R.: Pooling the convolutional layers in deep convnets for action recognition. arXiv preprint \n                      arXiv:1511.02126\n                      \n                     (2015)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2016 Workshops"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-46604-0_47","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,10,10]],"date-time":"2020-10-10T00:57:14Z","timestamp":1602291434000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-46604-0_47"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319466033","9783319466040"],"references-count":54,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-46604-0_47","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"18 September 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2016","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 October 2016","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 October 2016","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2016","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.eccv2016.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}