{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T19:48:18Z","timestamp":1771703298151,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,10,15]],"date-time":"2019-10-15T00:00:00Z","timestamp":1571097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Beijing Municipal Commission of Science and Technology","award":["Z181100008918005"],"award-info":[{"award-number":["Z181100008918005"]}]},{"name":"National Natural Science Foundation of China (NSFC)","award":["61772037"],"award-info":[{"award-number":["61772037"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,10,15]]},"DOI":"10.1145\/3343031.3351029","type":"proceedings-article","created":{"date-parts":[[2019,10,21]],"date-time":"2019-10-21T16:32:26Z","timestamp":1571675546000},"page":"2142-2151","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":33,"title":["Fast Non-Local Neural Networks with Spectral Residual Learning"],"prefix":"10.1145","author":[{"given":"Lu","family":"Chi","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}]},{"given":"Guiyu","family":"Tian","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"given":"Yadong","family":"Mu","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"given":"Lingxi","family":"Xie","sequence":"additional","affiliation":[{"name":"Noah's Ark Lab, Huawei, Beijing, China"}]},{"given":"Qi","family":"Tian","sequence":"additional","affiliation":[{"name":"Noah's Ark Lab, Huawei, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2019,10,15]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"YouTube-8M: A Large-Scale Video Classification Benchmark. CoRR","author":"Abu-El-Haija Sami","year":"2016"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/T-C.1974.223784"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Mykhaylo Andriluka Stefan Roth and Bernt Schiele. 2009. Pictorial structures revisited: People detection and articulated pose estimation. In CVPR . 1014--1021.  Mykhaylo Andriluka Stefan Roth and Bernt Schiele. 2009. Pictorial structures revisited: People detection and articulated pose estimation. In CVPR . 1014--1021.","DOI":"10.1109\/CVPR.2009.5206754"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Mykhaylo Andriluka Stefan Roth and Bernt Schiele. 2010. Monocular 3D pose estimation and tracking by detection. In CVPR. 623--630.  Mykhaylo Andriluka Stefan Roth and Bernt Schiele. 2010. Monocular 3D pose estimation and tracking by detection. In CVPR. 623--630.","DOI":"10.1109\/CVPR.2010.5540156"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Jo a o Carreira and Andrew Zisserman. 2017. Quo Vadis Action Recognition? A New Model and the Kinetics Dataset. In CVPR . 4724--4733.  Jo a o Carreira and Andrew Zisserman. 2017. Quo Vadis Action Recognition? A New Model and the Kinetics Dataset. In CVPR . 4724--4733.","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"e_1_3_2_1_7_1","volume-title":"Rethinking Atrous Convolution for Semantic Image Segmentation. CoRR","author":"Chen Liang-Chieh","year":"2017"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Liang-Chieh Chen Yukun Zhu George Papandreou Florian Schroff and Hartwig Adam. 2018 d. Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation. In ECCV .  Liang-Chieh Chen Yukun Zhu George Papandreou Florian Schroff and Hartwig Adam. 2018 d. Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation. In ECCV .","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"e_1_3_2_1_9_1","unstructured":"Yunpeng Chen Yannis Kalantidis Jianshu Li Shuicheng Yan and Jiashi Feng. 2018a. A^ 2-Nets: Double Attention Networks. In NIPS. 350--359.  Yunpeng Chen Yannis Kalantidis Jianshu Li Shuicheng Yan and Jiashi Feng. 2018a. A^ 2-Nets: Double Attention Networks. In NIPS. 350--359."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Yilun Chen Zhicheng Wang Yuxiang Peng Zhiqiang Zhang Gang Yu and Jian Sun. 2018c. Cascaded Pyramid Network for Multi-Person Pose Estimation. In CVPR . 7103--7112.  Yilun Chen Zhicheng Wang Yuxiang Peng Zhiqiang Zhang Gang Yu and Jian Sun. 2018c. Cascaded Pyramid Network for Multi-Person Pose Estimation. In CVPR . 7103--7112.","DOI":"10.1109\/CVPR.2018.00742"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1090\/S0025-5718-1965-0178586-1"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Jifeng Dai Haozhi Qi Yuwen Xiong Yi Li Guodong Zhang Han Hu and Yichen Wei. 2017. Deformable Convolutional Networks. In ICCV .  Jifeng Dai Haozhi Qi Yuwen Xiong Yi Li Guodong Zhang Han Hu and Yichen Wei. 2017. Deformable Convolutional Networks. In ICCV .","DOI":"10.1109\/ICCV.2017.89"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000042934.15159.49"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1976.1674569"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Jun Fu Jing Liu Haijie Tian Zhiwei Fang and Hanqing Lu. 2019. Dual Attention Network for Scene Segmentation. (2019).  Jun Fu Jing Liu Haijie Tian Zhiwei Fang and Hanqing Lu. 2019. Dual Attention Network for Scene Segmentation. (2019).","DOI":"10.1109\/CVPR.2019.00326"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Ross B. Girshick Jeff Donahue Trevor Darrell and Jitendra Malik. 2014. Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation. In CVPR . 580--587.  Ross B. Girshick Jeff Donahue Trevor Darrell and Jitendra Malik. 2014. Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation. In CVPR . 580--587.","DOI":"10.1109\/CVPR.2014.81"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Georgia Gkioxari Bharath Hariharan Ross B. Girshick and Jitendra Malik. 2014. Using k-Poselets for Detecting People and Localizing Their Keypoints. In CVPR . 3582--3589.  Georgia Gkioxari Bharath Hariharan Ross B. Girshick and Jitendra Malik. 2014. Using k-Poselets for Detecting People and Localizing Their Keypoints. In CVPR . 3582--3589.","DOI":"10.1109\/CVPR.2014.458"},{"key":"e_1_3_2_1_18_1","first-page":"3","article-title":"The \u201csomething something\u201d video database for learning and evaluating visual common sense","volume":"1","author":"Goyal Raghav","year":"2017","journal-title":"ICCV"},{"key":"e_1_3_2_1_19_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In CVPR. 770--778.  Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In CVPR. 770--778."},{"key":"e_1_3_2_1_20_1","volume-title":"Weinberger","author":"Huang Gao","year":"2017"},{"key":"e_1_3_2_1_21_1","volume-title":"CCNet: Criss-Cross Attention for Semantic Segmentation. arXiv preprint arXiv:1811.11721","author":"Huang Zilong","year":"2018"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Eldar Insafutdinov Leonid Pishchulin Bjoern Andres Mykhaylo Andriluka and Bernt Schiele. 2016. DeeperCut: A Deeper Stronger and Faster Multi-person Pose Estimation Model. In ECCV . 34--50.  Eldar Insafutdinov Leonid Pishchulin Bjoern Andres Mykhaylo Andriluka and Bernt Schiele. 2016. DeeperCut: A Deeper Stronger and Faster Multi-person Pose Estimation Model. In ECCV . 34--50.","DOI":"10.1007\/978-3-319-46466-4_3"},{"key":"e_1_3_2_1_23_1","volume-title":"Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. In ICML. 448--456.","author":"Ioffe Sergey","year":"2015"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Umar Iqbal and Juergen Gall. 2016. Multi-person Pose Estimation with Local Joint-to-Person Associations. In ECCV . 627--642.  Umar Iqbal and Juergen Gall. 2016. Multi-person Pose Estimation with Local Joint-to-Person Associations. In ECCV . 627--642.","DOI":"10.1007\/978-3-319-48881-3_44"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.59"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Leonid Karlinsky and Shimon Ullman. 2012. Using Linking Features in Learning Non-parametric Part Models. In ECCV. 326--339.  Leonid Karlinsky and Shimon Ullman. 2012. Using Linking Features in Learning Non-parametric Part Models. In ECCV. 326--339.","DOI":"10.1007\/978-3-642-33712-3_24"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Andrej Karpathy George Toderici Sanketh Shetty Thomas Leung Rahul Sukthankar and Fei-Fei Li. 2014. Large-Scale Video Classification with Convolutional Neural Networks. In CVPR . 1725--1732.  Andrej Karpathy George Toderici Sanketh Shetty Thomas Leung Rahul Sukthankar and Fei-Fei Li. 2014. Large-Scale Video Classification with Convolutional Neural Networks. In CVPR . 1725--1732.","DOI":"10.1109\/CVPR.2014.223"},{"key":"e_1_3_2_1_28_1","volume-title":"An Introduction to Harmonic Analysis","author":"Katznelson Yitzhak"},{"key":"e_1_3_2_1_29_1","volume-title":"Karen Simonyan, Brian Zhang, Chloe Hillier, Sudheendra Vijayanarasimhan, Fabio Viola, Tim Green, Trevor Back, Paul Natsev, Mustafa Suleyman, and Andrew Zisserman.","author":"Kay Will","year":"2017"},{"key":"e_1_3_2_1_30_1","volume-title":"Bin Xiao and Jingdong Wang","author":"Ke Sun Dong Liu","year":"2019"},{"key":"e_1_3_2_1_31_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2014"},{"key":"e_1_3_2_1_32_1","volume-title":"Hinton","author":"Krizhevsky Alex","year":"2012"},{"key":"e_1_3_2_1_33_1","volume-title":"Huttenlocher","author":"Lan Xiangyang","year":"2005"},{"key":"e_1_3_2_1_34_1","volume-title":"Piotr Doll\u00e1 r, and C. Lawrence Zitnick","author":"Lin Tsung-Yi","year":"2014"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Jonathan Long Evan Shelhamer and Trevor Darrell. 2015. Fully convolutional networks for semantic segmentation. In CVPR . 3431--3440.  Jonathan Long Evan Shelhamer and Trevor Darrell. 2015. Fully convolutional networks for semantic segmentation. In CVPR . 3431--3440.","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Xiang Long Chuang Gan Gerard de Melo Jiajun Wu Xiao Liu and Shilei Wen. 2018. Attention clusters: Purely attention based local feature integration for video classification. In CVPR. 7834--7843.  Xiang Long Chuang Gan Gerard de Melo Jiajun Wu Xiao Liu and Shilei Wen. 2018. Attention clusters: Purely attention based local feature integration for video classification. In CVPR. 7834--7843.","DOI":"10.1109\/CVPR.2018.00817"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Alejandro Newell Kaiyu Yang and Jia Deng. 2016. Stacked Hourglass Networks for Human Pose Estimation. In ECCV. 483--499.  Alejandro Newell Kaiyu Yang and Jia Deng. 2016. Stacked Hourglass Networks for Human Pose Estimation. In ECCV. 483--499.","DOI":"10.1007\/978-3-319-46484-8_29"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Juan Carlos Niebles Chih-Wei Chen and Fei-Fei Li. 2010. Modeling Temporal Structure of Decomposable Motion Segments for Activity Classification. In ECCV . 392--405.  Juan Carlos Niebles Chih-Wei Chen and Fei-Fei Li. 2010. Modeling Temporal Structure of Decomposable Motion Segments for Activity Classification. In ECCV . 392--405.","DOI":"10.1007\/978-3-642-15552-9_29"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Wanli Ouyang Xiao Chu and Xiaogang Wang. 2014. Multi-source Deep Learning for Human Pose Estimation. In CVPR . 2337--2344.  Wanli Ouyang Xiao Chu and Xiaogang Wang. 2014. Multi-source Deep Learning for Human Pose Estimation. In CVPR . 2337--2344.","DOI":"10.1109\/CVPR.2014.299"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Leonid Pishchulin Eldar Insafutdinov Siyu Tang Bjoern Andres Mykhaylo Andriluka Peter V. Gehler and Bernt Schiele. 2016. DeepCut: Joint Subset Partition and Labeling for Multi Person Pose Estimation. In CVPR . 4929--4937.  Leonid Pishchulin Eldar Insafutdinov Siyu Tang Bjoern Andres Mykhaylo Andriluka Peter V. Gehler and Bernt Schiele. 2016. DeepCut: Joint Subset Partition and Labeling for Multi Person Pose Estimation. In CVPR . 4929--4937.","DOI":"10.1109\/CVPR.2016.533"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Leonid Pishchulin Arjun Jain Mykhaylo Andriluka Thorsten Thorm\"a hlen and Bernt Schiele. 2012. Articulated people detection and pose estimation: Reshaping the future. In CVPR . 3178--3185.  Leonid Pishchulin Arjun Jain Mykhaylo Andriluka Thorsten Thorm\"a hlen and Bernt Schiele. 2012. Articulated people detection and pose estimation: Reshaping the future. In CVPR . 3178--3185.","DOI":"10.1109\/CVPR.2012.6248052"},{"key":"e_1_3_2_1_42_1","unstructured":"Zhaofan Qiu Ting Yao and Tao Mei. 2017. Learning Spatio-Temporal Representation with Pseudo-3D Residual Networks. In ICCV . 5534--5542.  Zhaofan Qiu Ting Yao and Tao Mei. 2017. Learning Spatio-Temporal Representation with Pseudo-3D Residual Networks. In ICCV . 5534--5542."},{"key":"e_1_3_2_1_43_1","unstructured":"Deva Ramanan David A. Forsyth and Andrew Zisserman. 2005. Strike a Pose: Tracking People by Finding Stylized Poses. In CVPR. 271--278.  Deva Ramanan David A. Forsyth and Andrew Zisserman. 2005. Strike a Pose: Tracking People by Finding Stylized Poses. In CVPR. 271--278."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1137\/1031127"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_47_1","volume-title":"Black","author":"Sigal Leonid","year":"2006"},{"key":"e_1_3_2_1_48_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Two-Stream Convolutional Networks for Action Recognition in Videos. In NIPS . 568--576.  Karen Simonyan and Andrew Zisserman. 2014. Two-Stream Convolutional Networks for Action Recognition in Videos. In NIPS . 568--576."},{"key":"e_1_3_2_1_49_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. In ICLR .  Karen Simonyan and Andrew Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. In ICLR ."},{"key":"e_1_3_2_1_50_1","unstructured":"Ke Sun Mingjie Li Dong Liu and Jingdong Wang. 2018. IGCV3: Interleaved Low-Rank Group Convolutions for Efficient Deep Neural Networks. In BMVC . 101.  Ke Sun Mingjie Li Dong Liu and Jingdong Wang. 2018. IGCV3: Interleaved Low-Rank Group Convolutions for Efficient Deep Neural Networks. In BMVC . 101."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Min Sun and Silvio Savarese. 2011. Articulated part-based model for joint object detection and pose estimation. In ICCV . 723--730.  Min Sun and Silvio Savarese. 2011. Articulated part-based model for joint object detection and pose estimation. In ICCV . 723--730.","DOI":"10.1109\/ICCV.2011.6126309"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Alexander Toshev and Christian Szegedy. 2014. DeepPose: Human Pose Estimation via Deep Neural Networks. In CVPR. 1653--1660.  Alexander Toshev and Christian Szegedy. 2014. DeepPose: Human Pose Estimation via Deep Neural Networks. In CVPR. 1653--1660.","DOI":"10.1109\/CVPR.2014.214"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Du Tran Lubomir D. Bourdev Rob Fergus Lorenzo Torresani and Manohar Paluri. 2015. Learning Spatiotemporal Features with 3D Convolutional Networks. In ICCV . 4489--4497.  Du Tran Lubomir D. Bourdev Rob Fergus Lorenzo Torresani and Manohar Paluri. 2015. Learning Spatiotemporal Features with 3D Convolutional Networks. In ICCV . 4489--4497.","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Du Tran Heng Wang Lorenzo Torresani Jamie Ray Yann LeCun and Manohar Paluri. 2018. A Closer Look at Spatiotemporal Convolutions for Action Recognition. In CVPR . 6450--6459.  Du Tran Heng Wang Lorenzo Torresani Jamie Ray Yann LeCun and Manohar Paluri. 2018. A Closer Look at Spatiotemporal Convolutions for Action Recognition. In CVPR . 6450--6459.","DOI":"10.1109\/CVPR.2018.00675"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2712608"},{"key":"e_1_3_2_1_56_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In NIPS. 6000--6010.  Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In NIPS. 6000--6010."},{"key":"e_1_3_2_1_57_1","volume-title":"Alexander Kl\"a ser, Ivan Laptev, and Cordelia Schmid.","author":"Wang Heng","year":"2009"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Limin Wang Wei Li Wen Li and Luc Van Gool. 2018b. Appearance-and-Relation Networks for Video Classification. In CVPR .  Limin Wang Wei Li Wen Li and Luc Van Gool. 2018b. Appearance-and-Relation Networks for Video Classification. In CVPR .","DOI":"10.1109\/CVPR.2018.00155"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Limin Wang Yuanjun Xiong Zhe Wang Yu Qiao Dahua Lin Xiaoou Tang and Luc Van Gool. 2016. Temporal Segment Networks: Towards Good Practices for Deep Action Recognition. In ECCV . 20--36.  Limin Wang Yuanjun Xiong Zhe Wang Yu Qiao Dahua Lin Xiaoou Tang and Luc Van Gool. 2016. Temporal Segment Networks: Towards Good Practices for Deep Action Recognition. In ECCV . 20--36.","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"e_1_3_2_1_60_1","volume-title":"Non-local Neural Networks. CVPR","author":"Wang Xiaolong","year":"2018"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","unstructured":"Yang Wang and Greg Mori. 2008. Multiple Tree Models for Occlusion and Spatial Constraints in Human Pose Estimation. In ECCV . 710--724.  Yang Wang and Greg Mori. 2008. Multiple Tree Models for Occlusion and Spatial Constraints in Human Pose Estimation. In ECCV . 710--724.","DOI":"10.1007\/978-3-540-88690-7_53"},{"key":"e_1_3_2_1_62_1","unstructured":"Shih-En Wei Varun Ramakrishna Takeo Kanade and Yaser Sheikh. 2016. Convolutional Pose Machines. In CVPR. 4724--4732.  Shih-En Wei Varun Ramakrishna Takeo Kanade and Yaser Sheikh. 2016. Convolutional Pose Machines. In CVPR. 4724--4732."},{"key":"e_1_3_2_1_63_1","unstructured":"Zuxuan Wu Xi Wang Yu-Gang Jiang Hao Ye and Xiangyang Xue. 2015. Modeling Spatial-Temporal Clues in a Hybrid Deep Learning Framework for Video Classification. In ACM Multimedia. 461--470.  Zuxuan Wu Xi Wang Yu-Gang Jiang Hao Ye and Xiangyang Xue. 2015. Modeling Spatial-Temporal Clues in a Hybrid Deep Learning Framework for Video Classification. In ACM Multimedia. 461--470."},{"key":"e_1_3_2_1_64_1","volume-title":"Davis","author":"Wu Zuxuan","year":"2018"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"crossref","unstructured":"Bin Xiao Haiping Wu and Yichen Wei. 2018. Simple Baselines for Human Pose Estimation and Tracking. In ECCV. 472--487.  Bin Xiao Haiping Wu and Yichen Wei. 2018. Simple Baselines for Human Pose Estimation and Tracking. In ECCV. 472--487.","DOI":"10.1007\/978-3-030-01231-1_29"},{"key":"e_1_3_2_1_66_1","volume-title":"Rethinking Spatiotemporal Feature Learning For Video Understanding. CoRR","author":"Xie Saining","year":"2017"},{"key":"e_1_3_2_1_67_1","unstructured":"Kaiyu Yue Ming Sun Yuchen Yuan Feng Zhou Errui Ding and Fuxin Xu. 2018. Compact Generalized Non-local Network. In NIPS. 6511--6520.  Kaiyu Yue Ming Sun Yuchen Yuan Feng Zhou Errui Ding and Fuxin Xu. 2018. Compact Generalized Non-local Network. In NIPS. 6511--6520."},{"key":"e_1_3_2_1_68_1","volume-title":"Self-Attention Generative Adversarial Networks. CoRR","author":"Zhang Han","year":"2018"},{"key":"e_1_3_2_1_69_1","unstructured":"Hengshuang Zhao Jianping Shi Xiaojuan Qi Xiaogang Wang and Jiaya Jia. 2017. Pyramid Scene Parsing Network. In CVPR. 6230--6239.  Hengshuang Zhao Jianping Shi Xiaojuan Qi Xiaogang Wang and Jiaya Jia. 2017. Pyramid Scene Parsing Network. In CVPR. 6230--6239."},{"key":"e_1_3_2_1_70_1","volume-title":"ECO: Efficient Convolutional Network for Online Video Understanding. In ECCV .","author":"Zolfaghari Mohammadreza","year":"2018"}],"event":{"name":"MM '19: The 27th ACM International Conference on Multimedia","location":"Nice France","acronym":"MM '19","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 27th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3351029","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3343031.3351029","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:13:11Z","timestamp":1750201991000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3351029"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,10,15]]},"references-count":70,"alternative-id":["10.1145\/3343031.3351029","10.1145\/3343031"],"URL":"https:\/\/doi.org\/10.1145\/3343031.3351029","relation":{},"subject":[],"published":{"date-parts":[[2019,10,15]]},"assertion":[{"value":"2019-10-15","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}