{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T23:47:32Z","timestamp":1769125652190,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","license":[{"start":{"date-parts":[[2016,10,1]],"date-time":"2016-10-01T00:00:00Z","timestamp":1475280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"China's National 863 Program","award":["2014AA015101"],"award-info":[{"award-number":["2014AA015101"]}]},{"name":"NSF China","award":["61572138"],"award-info":[{"award-number":["61572138"]}]},{"name":"National Natural Science","award":["U1509206"],"award-info":[{"award-number":["U1509206"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2016,10]]},"DOI":"10.1145\/2964284.2964328","type":"proceedings-article","created":{"date-parts":[[2016,9,29]],"date-time":"2016-09-29T19:17:32Z","timestamp":1475176652000},"page":"791-800","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":133,"title":["Multi-Stream Multi-Class Fusion of Deep Networks for Video Classification"],"prefix":"10.1145","author":[{"given":"Zuxuan","family":"Wu","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"given":"Yu-Gang","family":"Jiang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"given":"Xi","family":"Wang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"given":"Hao","family":"Ye","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"given":"Xiangyang","family":"Xue","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2016,10]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"INTERSPEECH","author":"Abdel-Hamid O.","year":"2013","unstructured":"O. Abdel-Hamid , L. Deng , and D. Yu . Exploring convolutional neural network structures and optimization techniques for speech recognition . In INTERSPEECH , 2013 . O. Abdel-Hamid, L. Deng, and D. Yu. Exploring convolutional neural network structures and optimization techniques for speech recognition. In INTERSPEECH, 2013."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.324"},{"key":"e_1_3_2_1_3_1","volume-title":"Optimization for Machine Learning","author":"Bach F.","year":"2011","unstructured":"F. Bach , R. Jenatton , J. Mairal , G. Obozinski , Convex optimization with sparsity-inducing norms . Optimization for Machine Learning , 2011 . F. Bach, R. Jenatton, J. Mairal, G. Obozinski, et al. Convex optimization with sparsity-inducing norms. Optimization for Machine Learning, 2011."},{"key":"e_1_3_2_1_4_1","volume-title":"CoRR","author":"Bengio S.","year":"2013","unstructured":"S. Bengio , J. Dean , D. Erhan , E. Ie , Q. Le , A. Rabinovich , J. Shlens , and Y. Singer . Using web co-occurrence statistics for improving image categorization . CoRR , 2013 . S. Bengio, J. Dean, D. Erhan, E. Ie, Q. Le, A. Rabinovich, J. Shlens, and Y. Singer. Using web co-occurrence statistics for improving image categorization. CoRR, 2013."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-24673-2_3"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.168"},{"key":"e_1_3_2_1_7_1","volume-title":"CoRR","author":"Chung J.","year":"2015","unstructured":"J. Chung , \u00c7. G\u00fcl\u00e7ehre, K. Cho , and Y. Bengio . Gated feedback recurrent neural networks . CoRR , 2015 . J. Chung, \u00c7. G\u00fcl\u00e7ehre, K. Cho, and Y. Bengio. Gated feedback recurrent neural networks. CoRR, 2015."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10590-1_4"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"e_1_3_2_1_10_1","author":"Donoho D. L.","year":"1995","unstructured":"D. L. Donoho and I. M. Johnstone . Adapting to unknown smoothness via wavelet shrinkage. Journal of the american statistical association , 1995 . D. L. Donoho and I. M. Johnstone. Adapting to unknown smoothness via wavelet shrinkage. Journal of the american statistical association, 1995.","journal-title":"Journal of the american statistical association"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.231"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299176"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459169"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2005.06.042"},{"key":"e_1_3_2_1_17_1","volume-title":"CoRR","author":"Hinton G.","year":"2015","unstructured":"G. Hinton , O. Vinyals , and J. Dean . Distilling the knowledge in a neural network . CoRR , 2015 . G. Hinton, O. Vinyals, and J. Dean. Distilling the knowledge in a neural network. CoRR, 2015."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_19_1","volume-title":"The thumos challenge on action recognition for videos\" in the wild\". arXiv preprint arXiv:1604.06182","author":"Idrees H.","year":"2016","unstructured":"H. Idrees , A. R. Zamir , Y.-G. Jiang , A. Gorban , I. Laptev , R. Sukthankar , and M. Shah . The thumos challenge on action recognition for videos\" in the wild\". arXiv preprint arXiv:1604.06182 , 2016 . H. Idrees, A. R. Zamir, Y.-G. Jiang, A. Gorban, I. Laptev, R. Sukthankar, and M. Shah. The thumos challenge on action recognition for videos\" in the wild\". arXiv preprint arXiv:1604.06182, 2016."},{"key":"e_1_3_2_1_20_1","volume-title":"CVPR THUMOS Workshop","author":"Jain M.","year":"2015","unstructured":"M. Jain and et al. University of amsterdam at thumos 2015 . In CVPR THUMOS Workshop , 2015 . M. Jain and et al. University of amsterdam at thumos 2015. In CVPR THUMOS Workshop, 2015."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00138-013-0567-0"},{"key":"e_1_3_2_1_22_1","volume-title":"ICML","author":"Ji S.","year":"2010","unstructured":"S. Ji , W. Xu , M. Yang , and K. Yu . 3d convolutional neural networks for human action recognition . In ICML , 2010 . S. Ji, W. Xu, M. Yang, and K. Yu. 3d convolutional neural networks for human action recognition. In ICML, 2010."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"e_1_3_2_1_24_1","volume-title":"ICCV","author":"Jiang Y.-G.","year":"2009","unstructured":"Y.-G. Jiang , J. Wang , S.-F. Chang , and C.-W. Ngo . Domain adaptive semantic diffusion for large scale context-based video annotation . In ICCV , 2009 . Y.-G. Jiang, J. Wang, S.-F. Chang, and C.-W. Ngo. Domain adaptive semantic diffusion for large scale context-based video annotation. In ICCV, 2009."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/1991996.1992025"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"e_1_3_2_1_27_1","author":"Kloft M.","year":"2011","unstructured":"M. Kloft , U. Brefeld , S. Sonnenburg , and A. Zien . Lp-norm multiple kernel learning. The Journal of Machine Learning Research , 2011 . M. Kloft, U. Brefeld, S. Sonnenburg, and A. Zien. Lp-norm multiple kernel learning. The Journal of Machine Learning Research, 2011.","journal-title":"Lp-norm multiple kernel learning. The Journal of Machine Learning Research"},{"key":"e_1_3_2_1_28_1","volume-title":"NIPS","author":"Krizhevsky A.","year":"2012","unstructured":"A. Krizhevsky , I. Sutskever , and G. E. Hinton . Imagenet classification with deep convolutional neural networks . In NIPS , 2012 . A. Krizhevsky, I. Sutskever, and G. E. Hinton. Imagenet classification with deep convolutional neural networks. In NIPS, 2012."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.288"},{"key":"e_1_3_2_1_30_1","volume-title":"CoRR","author":"Lan Z.","year":"2014","unstructured":"Z. Lan , M. Lin , X. Li , A. G. Hauptmann , and B. Raj . Beyond gaussian pyramid: Multi-skip feature stacking for action recognition . CoRR , 2014 . Z. Lan, M. Lin, X. Li, A. G. Hauptmann, and B. Raj. Beyond gaussian pyramid: Multi-skip feature stacking for action recognition. CoRR, 2014."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-005-1838-7"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.109"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000029664.99615.94"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-014-0723-7"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.29.178"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2007.70796"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.5555\/2354409.2354988"},{"key":"e_1_3_2_1_38_1","volume-title":"IEEE TPAMI","author":"Neverova N.","year":"2014","unstructured":"N. Neverova , C. Wolf , G. Taylor , and F. Nebout . Moddrop: adaptive multi-modal gesture recognition . IEEE TPAMI , 2014 . N. Neverova, C. Wolf, G. Taylor, and F. Nebout. Moddrop: adaptive multi-modal gesture recognition. IEEE TPAMI, 2014."},{"key":"e_1_3_2_1_39_1","volume-title":"CVPR","author":"Ng J. Y.-H.","year":"2015","unstructured":"J. Y.-H. Ng , M. Hausknecht , S. Vijayanarasimhan , O. Vinyals , R. Monga , and G. Toderici . Beyond short snippets: Deep networks for video classification . In CVPR , 2015 . J. Y.-H. Ng, M. Hausknecht, S. Vijayanarasimhan, O. Vinyals, R. Monga, and G. Toderici. Beyond short snippets: Deep networks for video classification. In CVPR, 2015."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.228"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2007.4408986"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.508"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0636-x"},{"key":"e_1_3_2_1_44_1","volume-title":"CoRR","author":"Sharma S.","year":"2015","unstructured":"S. Sharma , R. Kiros , and R. Salakhutdinov . Action recognition using visual attention . CoRR , 2015 . S. Sharma, R. Kiros, and R. Salakhutdinov. Action recognition using visual attention. CoRR, 2015."},{"key":"e_1_3_2_1_45_1","volume-title":"NIPS","author":"Simonyan K.","year":"2014","unstructured":"K. Simonyan and A. Zisserman . Two-stream convolutional networks for action recognition in videos . In NIPS , 2014 . K. Simonyan and A. Zisserman. Two-stream convolutional networks for action recognition in videos. In NIPS, 2014."},{"key":"e_1_3_2_1_46_1","volume-title":"ICLR","author":"Simonyan K.","year":"2015","unstructured":"K. Simonyan and A. Zisserman . Very deep convolutional networks for large-scale image recognition . In ICLR , 2015 . K. Simonyan and A. Zisserman. Very deep convolutional networks for large-scale image recognition. In ICLR, 2015."},{"key":"e_1_3_2_1_47_1","volume-title":"CoRR","author":"Soomro K.","year":"2012","unstructured":"K. Soomro , A. R. Zamir , and M. Shah . UCF101: A dataset of 101 human actions classes from videos in the wild . CoRR , 2012 . K. Soomro, A. R. Zamir, and M. Shah. UCF101: A dataset of 101 human actions classes from videos in the wild. CoRR, 2012."},{"key":"e_1_3_2_1_48_1","volume-title":"ICML","author":"Srivastava N.","year":"2015","unstructured":"N. Srivastava , E. Mansimov , and R. Salakhutdinov . Unsupervised learning of video representations using LSTMs . In ICML , 2015 . N. Srivastava, E. Mansimov, and R. Salakhutdinov. Unsupervised learning of video representations using LSTMs. In ICML, 2015."},{"key":"e_1_3_2_1_49_1","volume-title":"NIPS","author":"Srivastava N.","year":"2012","unstructured":"N. Srivastava and R. Salakhutdinov . Multimodal learning with deep boltzmann machines . In NIPS , 2012 . N. Srivastava and R. Salakhutdinov. Multimodal learning with deep boltzmann machines. In NIPS, 2012."},{"key":"e_1_3_2_1_50_1","volume-title":"CVPR","author":"Sun X.","year":"2009","unstructured":"X. Sun , M. Chen , and A. Hauptmann . Action recognition via local descriptors and holistic features . In CVPR , 2009 . X. Sun, M. Chen, and A. Hauptmann. Action recognition via local descriptors and holistic features. In CVPR, 2009."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.5555\/2354409.2354961"},{"key":"e_1_3_2_1_53_1","volume-title":"CoRR","author":"Tran D.","year":"2014","unstructured":"D. Tran , L. Bourdev , R. Fergus , L. Torresani , and M. Paluri . C3d: Generic features for video analysis . CoRR , 2014 . D. Tran, L. Bourdev, R. Fergus, L. Torresani, and M. Paluri. C3d: Generic features for video analysis. CoRR, 2014."},{"key":"e_1_3_2_1_54_1","volume-title":"NIPS","author":"Van den Oord A.","year":"2013","unstructured":"A. Van den Oord , S. Dieleman , and B. Schrauwen . Deep content-based music recommendation . In NIPS , 2013 . A. Van den Oord, S. Dieleman, and B. Schrauwen. Deep content-based music recommendation. In NIPS, 2013."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.460"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"e_1_3_2_1_57_1","volume-title":"ICCV THUMOS Workshop","author":"Wang H.","year":"2013","unstructured":"H. Wang and C. Schmid . Lear-inria submission for the thumos workshop . In ICCV THUMOS Workshop , 2013 . H. Wang and C. Schmid. Lear-inria submission for the thumos workshop. In ICCV THUMOS Workshop, 2013."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.23.124"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.291"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206709"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.339"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654931"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806222"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298789"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.427"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.512"},{"key":"e_1_3_2_1_68_1","volume-title":"CVPR","author":"Ye G.","year":"2012","unstructured":"G. Ye , D. Liu , I.-H. Jhuo , and S.-F. Chang . Robust late fusion with rank minimization . In CVPR , 2012 . G. Ye, D. Liu, I.-H. Jhuo, and S.-F. Chang. Robust late fusion with rank minimization. In CVPR, 2012."},{"key":"e_1_3_2_1_69_1","volume-title":"NIST TRECVID Video Retrieval Evaluation Workshop","author":"Yu S.-I.","year":"2014","unstructured":"S.-I. Yu , L. Jiang, and et al. Informedia@ trecvid 2014 med and mer . In NIST TRECVID Video Retrieval Evaluation Workshop , 2014 . S.-I. Yu, L. Jiang, and et al. Informedia@ trecvid 2014 med and mer. In NIST TRECVID Video Retrieval Evaluation Workshop, 2014."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.29.60"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964308"}],"event":{"name":"MM '16: ACM Multimedia Conference","location":"Amsterdam The Netherlands","acronym":"MM '16","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 24th ACM international conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2964284.2964328","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2964284.2964328","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T19:04:32Z","timestamp":1750273472000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2964284.2964328"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,10]]},"references-count":71,"alternative-id":["10.1145\/2964284.2964328","10.1145\/2964284"],"URL":"https:\/\/doi.org\/10.1145\/2964284.2964328","relation":{},"subject":[],"published":{"date-parts":[[2016,10]]},"assertion":[{"value":"2016-10-01","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}