{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T16:24:39Z","timestamp":1778171079421,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,10,12]],"date-time":"2020-10-12T00:00:00Z","timestamp":1602460800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"National Key Research and Development Program of China","award":["No. 2018AAA0102501"],"award-info":[{"award-number":["No. 2018AAA0102501"]}]},{"name":"National Natural Science Foundation of China","award":["No. 61876149"],"award-info":[{"award-number":["No. 61876149"]}]},{"name":"Samsung R&D Institute China Xi'an (SRCX)"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,10,12]]},"DOI":"10.1145\/3394171.3413641","type":"proceedings-article","created":{"date-parts":[[2020,10,12]],"date-time":"2020-10-12T13:10:18Z","timestamp":1602508218000},"page":"2039-2047","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":44,"title":["A Slow-I-Fast-P Architecture for Compressed Video Action Recognition"],"prefix":"10.1145","author":[{"given":"Jiapeng","family":"Li","sequence":"first","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"given":"Ping","family":"Wei","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"given":"Yongchi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"given":"Nanning","family":"Zheng","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]}],"member":"320","published-online":{"date-parts":[[2020,10,12]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33783-3_44"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_22"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00807"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1113\/jphysiol.1984.sp015498"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.316"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00630"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_2_9_1","unstructured":"Christoph Feichtenhofer Axel Pinz and Richard Wildes. 2016a. Spatiotemporal residual networks for video action recognition. In Advances in neural information processing systems. 3468--3476.  Christoph Feichtenhofer Axel Pinz and Richard Wildes. 2016a. Spatiotemporal residual networks for video action recognition. In Advances in neural information processing systems. 3468--3476."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.787"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.213"},{"key":"e_1_3_2_2_12_1","volume-title":"NY: 1991","volume":"1","author":"Felleman Daniel J","year":"1991"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00622"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1117\/12.965761"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1152\/jn.1965.28.2.229"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.179"},{"key":"e_1_3_2_2_18_1","volume-title":"European Conference on Computer Vision. Springer, 3--10","author":"Jason J Yu","year":"2016"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00209"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-019-04615-w"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/103085.103090"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01249-6_24"},{"key":"e_1_3_2_2_24_1","volume-title":"Science","volume":"240","author":"Livingstone Margaret","year":"1988"},{"key":"e_1_3_2_2_25_1","volume-title":"Proc. icml","volume":"30","author":"Maas Andrew L","year":"2013"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12276"},{"key":"e_1_3_2_2_27_1","volume-title":"Remote Sensing & Spatial Information Sciences","volume":"2","author":"Menze Moritz","year":"2015"},{"key":"e_1_3_2_2_28_1","volume-title":"2018 IEEE Winter Conference on Applications of Computer Vision (WACV). IEEE, 1616--1624","author":"Yue-Hei Ng Joe","year":"2018"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.590"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.291"},{"key":"e_1_3_2_2_31_1","volume-title":"German Conference on Pattern Recognition. Springer, 281--297","author":"Sevilla-Lara Laura","year":"2018"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00136"},{"key":"e_1_3_2_2_33_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Two-stream convolutional networks for action recognition in videos. In Advances in neural information processing systems. 568--576.  Karen Simonyan and Andrew Zisserman. 2014. Two-stream convolutional networks for action recognition in videos. In Advances in neural information processing systems. 568--576."},{"key":"e_1_3_2_2_34_1","volume-title":"Proceedings of the Neural Information Processing Systems (NIPS).","author":"Simonyan Karen","year":"2015"},{"key":"e_1_3_2_2_35_1","unstructured":"Khurram Soomro Amir Roshan Zamir and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012).  Khurram Soomro Amir Roshan Zamir and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-88690-7_7"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00931"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_2_39_1","unstructured":"Du Tran Jamie Ray Zheng Shou Shih-Fu Chang and Manohar Paluri. 2017. Convnet architecture search for spatiotemporal feature learning. arXiv preprint arXiv:1708.05038 (2017).  Du Tran Jamie Ray Zheng Shou Shih-Fu Chang and Manohar Paluri. 2017. Convnet architecture search for spatiotemporal feature learning. arXiv preprint arXiv:1708.05038 (2017)."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00565"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/0896-6273(94)90455-3"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00155"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00513"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2897902"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2574712"},{"key":"e_1_3_2_2_49_1","unstructured":"Joachim Weickert Andres Bruhn and Christoph Schn\u00f6rr. 2003. Lucas\/Kanade meets Horn\/Schunck: Combining local and global optic flow methods. (2003).  Joachim Weickert Andres Bruhn and Christoph Schn\u00f6rr. 2003. Lucas\/Kanade meets Horn\/Schunck: Combining local and global optic flow methods. (2003)."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00631"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"e_1_3_2_2_52_1","volume-title":"Joint pattern recognition symposium","author":"Zach Christopher"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.297"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2791180"},{"key":"e_1_3_2_2_55_1","volume-title":"Asian Conference on Computer Vision. Springer, 363--378","author":"Zhu Yi","year":"2018"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_43"}],"event":{"name":"MM '20: The 28th ACM International Conference on Multimedia","location":"Seattle WA USA","acronym":"MM '20","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 28th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3394171.3413641","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3394171.3413641","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:47:15Z","timestamp":1750193235000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3394171.3413641"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,10,12]]},"references-count":56,"alternative-id":["10.1145\/3394171.3413641","10.1145\/3394171"],"URL":"https:\/\/doi.org\/10.1145\/3394171.3413641","relation":{},"subject":[],"published":{"date-parts":[[2020,10,12]]},"assertion":[{"value":"2020-10-12","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}