{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:34:58Z","timestamp":1750221298618,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,10,15]],"date-time":"2018-10-15T00:00:00Z","timestamp":1539561600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Ministry of Science and ICT","award":["NRF-2017K1A3A1A16066838"],"award-info":[{"award-number":["NRF-2017K1A3A1A16066838"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,10,15]]},"DOI":"10.1145\/3265987.3265989","type":"proceedings-article","created":{"date-parts":[[2018,10,17]],"date-time":"2018-10-17T12:18:31Z","timestamp":1539778711000},"page":"21-26","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Learning to Detect, Associate, and Recognize Human Actions and Surrounding Scenes in Untrimmed Videos"],"prefix":"10.1145","author":[{"given":"Jungin","family":"Park","sequence":"first","affiliation":[{"name":"Yonsei University, Seoul, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sangryul","family":"Jeon","sequence":"additional","affiliation":[{"name":"Yonsei University, Seoul, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Seungryong","family":"Kim","sequence":"additional","affiliation":[{"name":"Yonsei University, Seoul, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiyoung","family":"Lee","sequence":"additional","affiliation":[{"name":"Yonsei University, Seoul, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sunok","family":"Kim","sequence":"additional","affiliation":[{"name":"Yonsei University, Seoul, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kwanghoon","family":"Sohn","sequence":"additional","affiliation":[{"name":"Yonsei University, Seoul, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2018,10,15]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Object in Action: An Approach for Combining Action Understanding and Object Perception. CVPR","author":"Gupta A.","year":"2007","unstructured":"A. Gupta and L.S. Davis . 2007. Object in Action: An Approach for Combining Action Understanding and Object Perception. CVPR ( 2007 ). A.Gupta and L.S.Davis. 2007. Object in Action: An Approach for Combining Action Understanding and Object Perception. CVPR (2007)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"e_1_3_2_1_3_1","volume-title":"Temporal Activity Detection in Untrimmed Videos with Recurrent Neural Networks. 1st NIPS Workshop on Large Scale Computer Vision Systems","author":"Montes A.","year":"2017","unstructured":"A. Montes , A. Salvador , S. Pascual , and X. Giro i Nieto . 2017 . Temporal Activity Detection in Untrimmed Videos with Recurrent Neural Networks. 1st NIPS Workshop on Large Scale Computer Vision Systems (2017). A.Montes, A.Salvador, S.Pascual, and X.Giro i Nieto. 2017. Temporal Activity Detection in Untrimmed Videos with Recurrent Neural Networks. 1st NIPS Workshop on Large Scale Computer Vision Systems (2017)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.175"},{"key":"e_1_3_2_1_5_1","volume-title":"Learning End-to-end Video Classification with Rank-Pooling. ICML","author":"Fernando B.","year":"2016","unstructured":"B. Fernando and S. Gould . 2016. Learning End-to-end Video Classification with Rank-Pooling. ICML ( 2016 ). B.Fernando and S.Gould. 2016. Learning End-to-end Video Classification with Rank-Pooling. ICML (2016)."},{"key":"e_1_3_2_1_6_1","volume":"201","author":"Kingma D.","unstructured":"D. Kingma and J.Ba. 201 4. Adam: A method for stochastic optimization. arXiv (2014). D.Kingma and J.Ba. 2014. Adam: A method for stochastic optimization. arXiv (2014).","journal-title":"J.Ba."},{"key":"e_1_3_2_1_7_1","volume-title":"ActivityNet: A Large- Scale Video Benchmark for Human Activity Understanding. CVPR","author":"Heilborn F.C.","year":"2015","unstructured":"F.C. Heilborn , V. Escorcia , B. Ghanem , and J.C. Niebles . 2015. ActivityNet: A Large- Scale Video Benchmark for Human Activity Understanding. CVPR ( 2015 ). F.C.Heilborn, V.Escorcia, B.Ghanem, and J.C.Niebles. 2015. ActivityNet: A Large- Scale Video Benchmark for Human Activity Understanding. CVPR (2015)."},{"key":"e_1_3_2_1_8_1","volume-title":"Fast Temporal Activity Proposals for Efficient Detection of Human Actions in Untrimmed Videos. CVPR","author":"Heilbron F.C.","year":"2016","unstructured":"F.C. Heilbron , J C. Niebles , and B. Ghanem . 2016. Fast Temporal Activity Proposals for Efficient Detection of Human Actions in Untrimmed Videos. CVPR ( 2016 ). F.C.Heilbron, JC.Niebles, and B.Ghanem. 2016. Fast Temporal Activity Proposals for Efficient Detection of Human Actions in Untrimmed Videos. CVPR (2016)."},{"key":"e_1_3_2_1_9_1","volume-title":"SCC: Semantic Context Cascade for Efficient Action Detection. CVPR","author":"Heilbron F.C.","year":"2017","unstructured":"F.C. Heilbron , W. Barrios , V. Escorcia , and B. Ghanem . 2017 . SCC: Semantic Context Cascade for Efficient Action Detection. CVPR (2017). F.C.Heilbron,W.Barrios, V.Escorcia, and B.Ghanem. 2017. SCC: Semantic Context Cascade for Efficient Action Detection. CVPR (2017)."},{"key":"e_1_3_2_1_10_1","volume-title":"Movie Scene Recognition using Panoramic Frame and Representative Feature Patches. JCST","author":"Gao Y.","year":"2014","unstructured":"G Y. Gao and HD.Ma. 2014. Movie Scene Recognition using Panoramic Frame and Representative Feature Patches. JCST ( 2014 ). GY.Gao and HD.Ma. 2014. Movie Scene Recognition using Panoramic Frame and Representative Feature Patches. JCST (2014)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2013.6475038"},{"key":"e_1_3_2_1_12_1","volume-title":"where and who? classifying events by scene and object recognition. ICCV","author":"Li J.","year":"2007","unstructured":"L.- J. Li and L. Fei-Fei . 2007. What , where and who? classifying events by scene and object recognition. ICCV ( 2007 ). L.-J.Li and L.Fei-Fei. 2007. What, where and who? classifying events by scene and object recognition. ICCV (2007)."},{"key":"e_1_3_2_1_13_1","volume-title":"UntrimmedNets for Weakly Supervised Action Recognition and Detection. CVPR","author":"Wang L.","year":"2017","unstructured":"L. Wang , Y. Xiong , D. Lin , and L.V. Gool . 2017. UntrimmedNets for Weakly Supervised Action Recognition and Detection. CVPR ( 2017 ). L.Wang, Y.Xiong, D.Lin, and L.V.Gool. 2017. UntrimmedNets for Weakly Supervised Action Recognition and Detection. CVPR (2017)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.521"},{"key":"e_1_3_2_1_15_1","volume-title":"Actions in context. CVPR","author":"Marszalek M.","year":"2009","unstructured":"M. Marszalek , I. Laptev , and C. Schmid . 2009. Actions in context. CVPR ( 2009 ). M.Marszalek, I.Laptev, and C.Schmid. 2009. Actions in context. CVPR (2009)."},{"key":"e_1_3_2_1_16_1","volume-title":"Scene and Actions: Combining Multiple Features for Human Action Recognition. ECCV","author":"Ikizler-Cinbis N.","year":"2010","unstructured":"N. Ikizler-Cinbis and S. Sclaroff . 2010. Object , Scene and Actions: Combining Multiple Features for Human Action Recognition. ECCV ( 2010 ). N.Ikizler-Cinbis and S.Sclaroff. 2010. Object, Scene and Actions: Combining Multiple Features for Human Action Recognition. ECCV (2010)."},{"volume-title":"NIPS 2017 Workshop","year":"2017","key":"e_1_3_2_1_17_1","unstructured":"Paszke, Adam, Gross, Sam, Chintala, Soumith, Chanan, Gregory, Yang, Edward, DeVito, Zachary, Lin, Zeming, Desmaison, Alban, Antiga, Luca, Lerer, and Adam. 2017 . Automatic differentiation in PyTorch . NIPS 2017 Workshop (2017). Paszke, Adam, Gross, Sam, Chintala, Soumith, Chanan, Gregory, Yang, Edward, DeVito, Zachary, Lin, Zeming, Desmaison, Alban, Antiga, Luca, Lerer, and Adam. 2017. Automatic differentiation in PyTorch. NIPS 2017 Workshop (2017)."},{"key":"e_1_3_2_1_18_1","volume-title":"YouTube-8M: A Large-Scale Video Classification Benchmark. arXiv","author":"Abu-El-Haija S.","year":"2016","unstructured":"S. Abu-El-Haija , N. Kothari , J. Lee , P. Natsev , G. Toderici , B. Varadarajan , and S. Vijayanarasimhan . 2016. YouTube-8M: A Large-Scale Video Classification Benchmark. arXiv ( 2016 ). S.Abu-El-Haija, N.Kothari, J.Lee, P.Natsev, G.Toderici, B.Varadarajan, and S.Vijayanarasimhan. 2016. YouTube-8M: A Large-Scale Video Classification Benchmark. arXiv (2016)."},{"key":"e_1_3_2_1_19_1","volume-title":"Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariance Shift. ICML","author":"Ioffe S.","year":"2015","unstructured":"S. Ioffe and C. Szegedy . 2015 . Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariance Shift. ICML (2015). S.Ioffe and C.Szegedy. 2015. Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariance Shift. ICML (2015)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-37431-9_46"},{"key":"e_1_3_2_1_21_1","volume-title":"Understanding the difficulty of training deep feedforward neural networks. AISTATS","author":"Glorot X.","year":"2010","unstructured":"X. Glorot and Y. Bengio . 2010. Understanding the difficulty of training deep feedforward neural networks. AISTATS ( 2010 ). X.Glorot and Y.Bengio. 2010. Understanding the difficulty of training deep feedforward neural networks. AISTATS (2010)."},{"key":"e_1_3_2_1_22_1","volume":"201","author":"Peng Y.","unstructured":"Y. Peng , Y. Zhao , and J.Zhang. 201 7. Two-stream Collaborative Learning with Spatial-temporal Attention for Video Classification. IEEE Transactions on Circuits and Systems for Video Technology (2017). Y.Peng, Y.Zhao, and J.Zhang. 2017. Two-stream Collaborative Learning with Spatial-temporal Attention for Video Classification. IEEE Transactions on Circuits and Systems for Video Technology (2017).","journal-title":"J.Zhang."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806222"},{"key":"e_1_3_2_1_24_1","volume-title":"Harnessing Object and Scene Semantics for Large-Scale Video Understanding. CVPR","author":"Wu Z.","year":"2016","unstructured":"Z. Wu , Y. Fu , Y. Jiang , and L. Sigal . 2016. Harnessing Object and Scene Semantics for Large-Scale Video Understanding. CVPR ( 2016 ). Z.Wu, Y.Fu, Y.Jiang, and L.Sigal. 2016. Harnessing Object and Scene Semantics for Large-Scale Video Understanding. CVPR (2016)."}],"event":{"name":"MM '18: ACM Multimedia Conference","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Seoul Republic of Korea","acronym":"MM '18"},"container-title":["Proceedings of the 1st Workshop and Challenge on Comprehensive Video Understanding in the Wild"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3265987.3265989","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3265987.3265989","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T02:13:15Z","timestamp":1750212795000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3265987.3265989"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,10,15]]},"references-count":24,"alternative-id":["10.1145\/3265987.3265989","10.1145\/3265987"],"URL":"https:\/\/doi.org\/10.1145\/3265987.3265989","relation":{},"subject":[],"published":{"date-parts":[[2018,10,15]]},"assertion":[{"value":"2018-10-15","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}