{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T15:53:41Z","timestamp":1769442821558,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2016,6,6]],"date-time":"2016-06-06T00:00:00Z","timestamp":1465171200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2016,6,6]]},"DOI":"10.1145\/2911996.2912001","type":"proceedings-article","created":{"date-parts":[[2016,6,10]],"date-time":"2016-06-10T13:09:57Z","timestamp":1465564197000},"page":"159-166","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":93,"title":["Action Recognition by Learning Deep Multi-Granular Spatio-Temporal Video Representation"],"prefix":"10.1145","author":[{"given":"Qing","family":"Li","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"given":"Zhaofan","family":"Qiu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"given":"Ting","family":"Yao","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"given":"Tao","family":"Mei","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"given":"Yong","family":"Rui","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[{"name":"University of Rochester, Rochester, USA"}]}],"member":"320","published-online":{"date-parts":[[2016,6,6]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-24673-2_3"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/VSPETS.2005.1570899"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.21236\/ADA623249"},{"key":"e_1_3_2_1_4_1","volume-title":"ACCV","author":"Hoai M.","year":"2014","unstructured":"M. Hoai and A. Zisserman . Improving human action recognition using score distribution and ranking . In ACCV , 2014 . M. Hoai and A. Zisserman. Improving human action recognition using score distribution and ranking. In ACCV, 2014."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.330"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.59"},{"key":"e_1_3_2_1_8_1","volume-title":"Ca e: Convolutional architecture for fast feature embedding. arXiv preprint arXiv:1408.5093","author":"Jia Y.","year":"2014","unstructured":"Y. Jia , E. Shelhamer , J. Donahue , S. Karayev , J. Long , R. Girshick , S. Guadarrama , and T. Darrell . Ca e: Convolutional architecture for fast feature embedding. arXiv preprint arXiv:1408.5093 , 2014 . Y. Jia, E. Shelhamer, J. Donahue, S. Karayev, J. Long, R. Girshick, S. Guadarrama, and T. Darrell. Ca e: Convolutional architecture for fast feature embedding. arXiv preprint arXiv:1408.5093, 2014."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.22.99"},{"key":"e_1_3_2_1_11_1","volume-title":"NIPS","author":"Krizhevsky A.","year":"2012","unstructured":"A. Krizhevsky , I. Sutskever , and G. E. Hinton . Imagenet classification with deep convolutional neural networks . In NIPS , 2012 . A. Krizhevsky, I. Sutskever, and G. E. Hinton. Imagenet classification with deep convolutional neural networks. In NIPS, 2012."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_2_1_13_1","volume-title":"CVPR","author":"Lan Z.","year":"2015","unstructured":"Z. Lan , M. Lin , X. Li , A. G. Hauptmann , and B. Raj . Beyond gaussian pyramid: Multi-skip feature stacking for action recognition . In CVPR , 2015 . Z. Lan, M. Lin, X. Li, A. G. Hauptmann, and B. Raj. Beyond gaussian pyramid: Multi-skip feature stacking for action recognition. In CVPR, 2015."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.5555\/946247.946605"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"e_1_3_2_1_16_1","volume-title":"CVPR","author":"Ng J. Y.-H.","year":"2015","unstructured":"J. Y.-H. Ng , M. Hausknecht , S. Vijayanarasimhan , O. Vinyals , R. Monga , and G. Toderici . Beyond short snippets deep networks for video classi cation . In CVPR , 2015 . J. Y.-H. Ng, M. Hausknecht, S. Vijayanarasimhan, O. Vinyals, R. Monga, and G. Toderici. Beyond short snippets deep networks for video classi cation. In CVPR, 2015."},{"key":"e_1_3_2_1_17_1","volume-title":"Jointly modeling embedding and translation to bridge video and language. arXiv preprint arXiv:1505.01861v3","author":"Pan Y.","year":"2015","unstructured":"Y. Pan , T. Mei , T. Yao , H. Li , and Y. Rui . Jointly modeling embedding and translation to bridge video and language. arXiv preprint arXiv:1505.01861v3 , 2015 . Y. Pan, T. Mei, T. Yao, H. Li, and Y. Rui. Jointly modeling embedding and translation to bridge video and language. arXiv preprint arXiv:1505.01861v3, 2015."},{"key":"e_1_3_2_1_18_1","volume-title":"Bag of visual words and fusion methods for action recognition: Comprehensive study and good practice. arXiv preprint arXiv:1405.4506","author":"Peng X.","year":"2014","unstructured":"X. Peng , L. Wang , X. Wang , and Y. Qiao . Bag of visual words and fusion methods for action recognition: Comprehensive study and good practice. arXiv preprint arXiv:1405.4506 , 2014 . X. Peng, L. Wang, X. Wang, and Y. Qiao. Bag of visual words and fusion methods for action recognition: Comprehensive study and good practice. arXiv preprint arXiv:1405.4506, 2014."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.5555\/1888089.1888101"},{"key":"e_1_3_2_1_20_1","volume-title":"CVPR THUMOS Challenge Workshop","author":"Qiu Z.","year":"2015","unstructured":"Z. Qiu , Q. Li , T. Yao , T. Mei , and Y. Rui . Msrasiamsm at thumos challenge 2015 . In CVPR THUMOS Challenge Workshop , 2015 . Z. Qiu, Q. Li, T. Yao, T. Mei, and Y. Rui. Msrasiamsm at thumos challenge 2015. In CVPR THUMOS Challenge Workshop, 2015."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/1291233.1291311"},{"key":"e_1_3_2_1_22_1","volume-title":"NIPS","author":"Simonyan K.","year":"2013","unstructured":"K. Simonyan , A. Vedaldi , and A. Zisserman . Deep sher networks for large-scale image classi cation . In NIPS , 2013 . K. Simonyan, A. Vedaldi, and A. Zisserman. Deep sher networks for large-scale image classi cation. In NIPS, 2013."},{"key":"e_1_3_2_1_23_1","volume-title":"NIPS","author":"Simonyan K.","year":"2014","unstructured":"K. Simonyan and A. Zisserman . Two-stream convolutional networks for action recognition in videos . In NIPS , 2014 . K. Simonyan and A. Zisserman. Two-stream convolutional networks for action recognition in videos. In NIPS, 2014."},{"key":"e_1_3_2_1_24_1","volume-title":"ICLR","author":"Simonyan K.","year":"2015","unstructured":"K. Simonyan and A. Zisserman . Very deep convolutional networks for large-scale image recognition . In ICLR , 2015 . K. Simonyan and A. Zisserman. Very deep convolutional networks for large-scale image recognition. In ICLR, 2015."},{"key":"e_1_3_2_1_25_1","volume-title":"UCF101: A dataset of 101 human action classes from videos in the wild. CRCV-TR-12-01","author":"Soomro K.","year":"2012","unstructured":"K. Soomro , A. R. Zamir , and M. Shah . UCF101: A dataset of 101 human action classes from videos in the wild. CRCV-TR-12-01 , 2012 . K. Soomro, A. R. Zamir, and M. Shah. UCF101: A dataset of 101 human action classes from videos in the wild. CRCV-TR-12-01, 2012."},{"key":"e_1_3_2_1_26_1","volume-title":"ICML","author":"Srivastava N.","year":"2015","unstructured":"N. Srivastava , E. Mansimov , and R. Salakhutdinov . Unsupervised learning of video representations using lstms . In ICML , 2015 . N. Srivastava, E. Mansimov, and R. Salakhutdinov. Unsupervised learning of video representations using lstms. In ICML, 2015."},{"key":"e_1_3_2_1_27_1","volume-title":"Learning spatiotemporal features with 3d convolutional networks. arXiv preprint arXiv:1412.0767","author":"Tran D.","year":"2014","unstructured":"D. Tran , L. D. Bourdev , R. Fergus , L. Torresani , and M. Paluri . Learning spatiotemporal features with 3d convolutional networks. arXiv preprint arXiv:1412.0767 , 2014 . D. Tran, L. D. Bourdev, R. Fergus, L. Torresani, and M. Paluri. Learning spatiotemporal features with 3d convolutional networks. arXiv preprint arXiv:1412.0767, 2014."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-88688-4_48"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654931"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2502081.2502085"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2006.313037"},{"key":"e_1_3_2_1_35_1","volume-title":"Exploiting image-trained CNN architectures for unconstrained video classification. arXiv preprint arXiv:1503.04144","author":"Zha S.","year":"2015","unstructured":"S. Zha , F. Luisier , W. Andrews , N. Srivastava , and R. Salakhutdinov . Exploiting image-trained CNN architectures for unconstrained video classification. arXiv preprint arXiv:1503.04144 , 2015 . S. Zha, F. Luisier, W. Andrews, N. Srivastava, and R. Salakhutdinov. Exploiting image-trained CNN architectures for unconstrained video classification. arXiv preprint arXiv:1503.04144, 2015."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/1290082.1290114"}],"event":{"name":"ICMR'16: International Conference on Multimedia Retrieval","location":"New York New York USA","acronym":"ICMR'16","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2016 ACM on International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2911996.2912001","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2911996.2912001","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:56:07Z","timestamp":1750222567000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2911996.2912001"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,6,6]]},"references-count":36,"alternative-id":["10.1145\/2911996.2912001","10.1145\/2911996"],"URL":"https:\/\/doi.org\/10.1145\/2911996.2912001","relation":{},"subject":[],"published":{"date-parts":[[2016,6,6]]},"assertion":[{"value":"2016-06-06","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}