{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,20]],"date-time":"2025-10-20T10:25:07Z","timestamp":1760955907929,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,10,15]],"date-time":"2018-10-15T00:00:00Z","timestamp":1539561600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Science Foundation of China","award":["61602014"],"award-info":[{"award-number":["61602014"]}]},{"name":"National Engineering Laboratory Shenzhen Division for Video Technology"},{"name":"National Natural Science Foundation of China and Guangdong Province Scientific Research on Big Data","award":["No. U1611461"],"award-info":[{"award-number":["No. U1611461"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,10,15]]},"DOI":"10.1145\/3240508.3240659","type":"proceedings-article","created":{"date-parts":[[2018,10,18]],"date-time":"2018-10-18T17:52:08Z","timestamp":1539885128000},"page":"993-1001","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Online Action Tube Detection via Resolving the Spatio-temporal Context Pattern"],"prefix":"10.1145","author":[{"given":"Jingjia","family":"Huang","sequence":"first","affiliation":[{"name":"Peking University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nannan","family":"Li","sequence":"additional","affiliation":[{"name":"Peking University Shenzhen Graduate School, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiaxing","family":"Zhong","sequence":"additional","affiliation":[{"name":"Peking University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Thomas H.","family":"Li","sequence":"additional","affiliation":[{"name":"Gpower Semiconductor Inc, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ge","family":"Li","sequence":"additional","affiliation":[{"name":"Peking University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2018,10,15]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.175"},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition (2016)","author":"Feichtenhofer C.","key":"e_1_3_2_1_3_1","unstructured":"Feichtenhofer C. , Pinz A. , and Zisserman A . 2016. Convolutional two-stream network fusion for video action recognition . IEEE Conference on Computer Vision and Pattern Recognition (2016) , 1933--1941. Feichtenhofer C., Pinz A., and Zisserman A. 2016. Convolutional two-stream network fusion for video action recognition. IEEE Conference on Computer Vision and Pattern Recognition (2016), 1933--1941."},{"key":"e_1_3_2_1_4_1","volume-title":"European Conference on Computer Vision","author":"Oneata D.","year":"2014","unstructured":"Oneata D. , Revaud J. , Verbeek J. , and Schmmid C . 2014. Spatio-temporal object detection proposals . European Conference on Computer Vision ( 2014 ). Oneata D., Revaud J., Verbeek J., and Schmmid C. 2014. Spatio-temporal object detection proposals. European Conference on Computer Vision (2014)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_1_6_1","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Gkioxari G.","year":"2015","unstructured":"Gkioxari G. and Malik J . 2015. Finding action tubes . IEEE Conference on Computer Vision and Pattern Recognition ( 2015 ), 759--768. Gkioxari G. and Malik J. 2015. Finding action tubes. IEEE Conference on Computer Vision and Pattern Recognition (2015), 759--768."},{"key":"e_1_3_2_1_7_1","unstructured":"Singh G. Saha S. and Cuzzolin F. 2016. Online Real time Multiple Spatiotemporal Action Localisation and Prediction on a Single Platform. ArXiv (2016). https: \/\/doi.org\/arXiv:1611.08563  Singh G. Saha S. and Cuzzolin F. 2016. Online Real time Multiple Spatiotemporal Action Localisation and Prediction on a Single Platform. ArXiv (2016). https: \/\/doi.org\/arXiv:1611.08563"},{"key":"e_1_3_2_1_8_1","unstructured":"Behl H. Sapienza M. Singh G. Saha S. Cuzzolin F. and Torr P.H. 2017. Incremental tube construction for human action detection. ArXiv (2017). https: \/\/doi.org\/arXiv:1704.01358  Behl H. Sapienza M. Singh G. Saha S. Cuzzolin F. and Torr P.H. 2017. Incremental tube construction for human action detection. ArXiv (2017). https: \/\/doi.org\/arXiv:1704.01358"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"e_1_3_2_1_10_1","volume-title":"TORNADO: A Spatio-Temporal Convolutional Regression Network for Video Action Proposal. IEEE International Conference on Computer Vision","author":"Zhu H.","year":"2017","unstructured":"Zhu H. , Vial R. , and Lu S . 2017 . TORNADO: A Spatio-Temporal Convolutional Regression Network for Video Action Proposal. IEEE International Conference on Computer Vision ( 2017 ), 5814--5822. Zhu H., Vial R., and Lu S. 2017. TORNADO: A Spatio-Temporal Convolutional Regression Network for Video Action Proposal. IEEE International Conference on Computer Vision (2017), 5814--5822."},{"key":"e_1_3_2_1_11_1","volume-title":"IEEE conference on computer vision and pattern recognition","author":"Donahue J.","year":"2015","unstructured":"Donahue J. , Anne Hendricks L. , Guadarrama S. , Rohrbach M. , Venugopalan S. , Saenko K. , and Darrell T . 2015. Long-term recurrent convolutional networks for visual recognition and description . IEEE conference on computer vision and pattern recognition ( 2015 ), 2625--2634. Donahue J., Anne Hendricks L., Guadarrama S., Rohrbach M., Venugopalan S., Saenko K., and Darrell T. 2015. Long-term recurrent convolutional networks for visual recognition and description. IEEE conference on computer vision and pattern recognition (2015), 2625--2634."},{"key":"e_1_3_2_1_12_1","volume-title":"IEEE conference on computer vision and pattern recognition","author":"Yue-Hei Ng J.","year":"2015","unstructured":"Yue-Hei Ng J. , Hausknecht M. , Vijayanarasimhan S. , Vinyals O. , Monga R. , and Toderici G . 2015. Beyond short snippets: Deep networks for video classification . IEEE conference on computer vision and pattern recognition ( 2015 ), 4694--4702. Yue-Hei Ng J., Hausknecht M., Vijayanarasimhan S., Vinyals O., Monga R., and Toderici G. 2015. Beyond short snippets: Deep networks for video classification. IEEE conference on computer vision and pattern recognition (2015), 4694--4702."},{"key":"e_1_3_2_1_13_1","volume-title":"British Machine Vision Conference","author":"Gemert J.C.","year":"2015","unstructured":"Gemert J.C. , Jain M. , Gati E. , and Snoek C.G . 2015. Apt: Action localization proposals from dense trajectories . British Machine Vision Conference ( 2015 ), 177--184. Gemert J.C., Jain M., Gati E., and Snoek C.G. 2015. Apt: Action localization proposals from dense trajectories. British Machine Vision Conference (2015), 177--184."},{"key":"e_1_3_2_1_14_1","unstructured":"Simonyan K. and Zisserman A. 2014. Very deep convolutional networks for large scale image recognition. ArXiv (2014). https:\/\/doi.org\/arXiv:1409.1556  Simonyan K. and Zisserman A. 2014. Very deep convolutional networks for large scale image recognition. ArXiv (2014). https:\/\/doi.org\/arXiv:1409.1556"},{"key":"e_1_3_2_1_15_1","unstructured":"Soomro K. Zamir A.R. and Shah M. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. ArXiv (2012). https:\/\/doi.org\/arXiv: 1212.0402 2012  Soomro K. Zamir A.R. and Shah M. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. ArXiv (2012). https:\/\/doi.org\/arXiv: 1212.0402 2012"},{"key":"e_1_3_2_1_16_1","volume-title":"Action Tubelet Detector for Spatio-Temporal Action Localization. In IEEE International Conference on Computer Vision. 4415--4423","author":"Vicky Kalogeiton","year":"2017","unstructured":"Vicky Kalogeiton , Philippe Weinzaepfel, Vittorio Ferrari , and Cordelia Schmid. 2017 . Action Tubelet Detector for Spatio-Temporal Action Localization. In IEEE International Conference on Computer Vision. 4415--4423 . Vicky Kalogeiton, Philippe Weinzaepfel, Vittorio Ferrari, and Cordelia Schmid. 2017. Action Tubelet Detector for Spatio-Temporal Action Localization. In IEEE International Conference on Computer Vision. 4415--4423."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2465955"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.100"},{"key":"e_1_3_2_1_19_1","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Rodriguez M.D.","year":"2008","unstructured":"Rodriguez M.D. , Ahmed J. , and Shah M . 2008. Action mach a spatio-temporal maximum average correlation height filter for action recognition . IEEE Conference on Computer Vision and Pattern Recognition ( 2008 ), 1--8. Rodriguez M.D., Ahmed J., and Shah M. 2008. Action mach a spatio-temporal maximum average correlation height filter for action recognition. IEEE Conference on Computer Vision and Pattern Recognition (2008), 1--8."},{"key":"e_1_3_2_1_20_1","volume-title":"Asian Conference on Computing Vision","author":"Li N.","year":"2012","unstructured":"Li N. , Xu D. , Ying . Z. , Li Z. , and Li G . 2012. Searching action proposals via spatial actionness estimation and temporal path inference and tracking . Asian Conference on Computing Vision ( 2012 ), 384--399. Li N., Xu D., Ying. Z., Li Z., and Li G. 2012. Searching action proposals via spatial actionness estimation and temporal path inference and tracking. Asian Conference on Computing Vision (2012), 384--399."},{"key":"e_1_3_2_1_21_1","volume-title":"IEEE International Conference on Computer Vision","author":"Tokmakov P.","year":"2017","unstructured":"Tokmakov P. , Alahari K. , and Schmid C . 2017. Learning video object segmentation with visual memory . IEEE International Conference on Computer Vision ( 2017 ), 4491--4500. Tokmakov P., Alahari K., and Schmid C. 2017. Learning video object segmentation with visual memory. IEEE International Conference on Computer Vision (2017), 4491--4500."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.362"},{"key":"e_1_3_2_1_23_1","unstructured":"Collobert R. Kavukcuoglu K. and Farabet C. 2011. Torch7: A matlab-like environment for machine learning. Advances in neural information processing systems workshop (2011).  Collobert R. Kavukcuoglu K. and Farabet C. 2011. Torch7: A matlab-like environment for machine learning. Advances in neural information processing systems workshop (2011)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"e_1_3_2_1_25_1","unstructured":"Ren S. He K. Girshick R. and Sun J. 2015. Faster R-CNN: Towards real-time object detection with region proposal networks. Advances in neural information processing systems (2015) 91--99.   Ren S. He K. Girshick R. and Sun J. 2015. Faster R-CNN: Towards real-time object detection with region proposal networks. Advances in neural information processing systems (2015) 91--99."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Saha S. Singh G. Sapienza M. Torr P.H. and Cuzzolin F. 2016. Deep learning for detecting multiple space-time action tubes in videos. ArXiv (2016). https: \/\/doi.org\/arXiv:1608.01529  Saha S. Singh G. Sapienza M. Torr P.H. and Cuzzolin F. 2016. Deep learning for detecting multiple space-time action tubes in videos. ArXiv (2016). https: \/\/doi.org\/arXiv:1608.01529","DOI":"10.5244\/C.30.58"},{"key":"e_1_3_2_1_27_1","volume-title":"European Conference on Computer Vision","author":"Jain S.D.","year":"2007","unstructured":"Jain S.D. and Grauman K . 2007. Supervoxel-consistent foreground propagation in video . European Conference on Computer Vision ( 2007 ), 656--671. Jain S.D. and Grauman K. 2007. Supervoxel-consistent foreground propagation in video. European Conference on Computer Vision (2007), 656--671."},{"key":"e_1_3_2_1_28_1","unstructured":"Tieleman T. and Hinton G. 2012. RMSProp. COURSERA:Lecture 6.5 - Neural Networks for Machine Learning (2012).  Tieleman T. and Hinton G. 2012. RMSProp. COURSERA:Lecture 6.5 - Neural Networks for Machine Learning (2012)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Kalogeiton V. Weinzaepfel P. Ferrari V. and Schmid C. 2017. Action Tubelet Detector for Spatio-Temporal Action Localization. ArXiv (2017). https:\/\/doi.org\/ arXiv:1705.01861  Kalogeiton V. Weinzaepfel P. Ferrari V. and Schmid C. 2017. Action Tubelet Detector for Spatio-Temporal Action Localization. ArXiv (2017). https:\/\/doi.org\/ arXiv:1705.01861","DOI":"10.1109\/ICCV.2017.472"},{"key":"e_1_3_2_1_30_1","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Sultani W.","year":"2016","unstructured":"Sultani W. and Shah M . 2016. What If We Do Not Have Multiple Videos of the Same Action?--Video Action Localization Using Web Images . IEEE Conference on Computer Vision and Pattern Recognition ( 2016 ), 1077--1085. Sultani W. and Shah M. 2016. What If We Do Not Have Multiple Videos of the Same Action?--Video Action Localization Using Web Images. IEEE Conference on Computer Vision and Pattern Recognition (2016), 1077--1085."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition (2016)","author":"Zhu W.","key":"e_1_3_2_1_31_1","unstructured":"Zhu W. , Hu J. , Sun G. , Cao X. , and Qiao Y . 2016. A key volume mining deep framework for action recognition . IEEE Conference on Computer Vision and Pattern Recognition (2016) , 1991--1999. Zhu W., Hu J., Sun G., Cao X., and Qiao Y. 2016. A key volume mining deep framework for action recognition. IEEE Conference on Computer Vision and Pattern Recognition (2016), 1991--1999."},{"key":"e_1_3_2_1_32_1","volume-title":"European Conference on Computer Vision","author":"Peng X.","year":"2016","unstructured":"Peng X. and Schmid C . 2016. Multi-region two-stream R-CNN for action detection . European Conference on Computer Vision ( 2016 ), 744--759. Peng X. and Schmid C. 2016. Multi-region two-stream R-CNN for action detection. European Conference on Computer Vision (2016), 744--759."}],"event":{"name":"MM '18: ACM Multimedia Conference","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Seoul Republic of Korea","acronym":"MM '18"},"container-title":["Proceedings of the 26th ACM international conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3240508.3240659","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3240508.3240659","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T00:43:31Z","timestamp":1750207411000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3240508.3240659"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,10,15]]},"references-count":32,"alternative-id":["10.1145\/3240508.3240659","10.1145\/3240508"],"URL":"https:\/\/doi.org\/10.1145\/3240508.3240659","relation":{},"subject":[],"published":{"date-parts":[[2018,10,15]]},"assertion":[{"value":"2018-10-15","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}