{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:19:21Z","timestamp":1750220361123,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,8,17]],"date-time":"2021-08-17T00:00:00Z","timestamp":1629158400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,8,17]]},"DOI":"10.1145\/3465481.3470059","type":"proceedings-article","created":{"date-parts":[[2021,8,16]],"date-time":"2021-08-16T18:07:25Z","timestamp":1629137245000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["What should we pay attention to when classifying violent videos?"],"prefix":"10.1145","author":[{"given":"Marcos Vin\u00edcius","family":"Ad\u00e3o Teixeira","sequence":"first","affiliation":[{"name":"University of Campinas (Unicamp), BR"}]},{"given":"Sandra","family":"Avila","sequence":"additional","affiliation":[{"name":"University of Campinas (Unicamp), BR"}]}],"member":"320","published-online":{"date-parts":[[2021,8,17]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2016.05.050"},{"key":"e_1_3_2_1_2_1","unstructured":"D. Bahdanau K. Cho and Y. Bengio. 2015. Neural Machine Translation by Jointly Learning to Align and Translate. In ICLR."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"E. Bermejo O. Deniz G. Bueno and R. Sukthankar. 2011. Violence detection in video using computer vision techniques. In CAIP. 332\u2013339.","DOI":"10.1007\/978-3-642-23678-5_39"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"P. Bilinski and F. Bremond. 2016. Human violence recognition and detection in surveillance videos. In IEEE AVSS. 30\u201336.","DOI":"10.1109\/AVSS.2016.7738019"},{"volume-title":"Improving the Dense Trajectories Approach Towards Efficient Recognition of Simple Human Activities","author":"Camarena F.","key":"e_1_3_2_1_5_1","unstructured":"F. Camarena, L. Chang, and M.l Gonzalez-Mendoza. 2019. Improving the Dense Trajectories Approach Towards Efficient Recognition of Simple Human Activities. In IEEE IWBF. 1\u20136."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"S. Carneiro G. da Silva S. Guimaraes and H. Pedrini. 2019. Fight Detection in Video Sequences Based on Multi-Stream Convolutional Neural Networks. In SIBGRAPI. 8\u201315.","DOI":"10.1109\/SIBGRAPI.2019.00010"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"J. Carreira and A. Zisserman. 2017. Quo vadis action recognition? A new model and the kinetics dataset. In CVPR. 6299\u20136308.","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_8_1","volume-title":"Details: Delving Deep into Convolutional Nets. In BMVC.","author":"Chatfield K.","year":"2014","unstructured":"K. Chatfield, K. Simonyan, A. Vedaldi, and A. Zisserman. 2014. Return of the Devil in the Details: Delving Deep into Convolutional Nets. In BMVC."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"L. Chen H. Hsu L. Wang and C. Su. 2011. Violence detection in movies. In CGiV. 119\u2013124.","DOI":"10.1109\/CGIV.2011.14"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"J. Cheng L. Dong and M. Lapata. 2016. Long Short-Term Memory-Networks for Machine Reading. In EMNLP. 551\u2013561.","DOI":"10.18653\/v1\/D16-1053"},{"key":"e_1_3_2_1_11_1","unstructured":"M. Cheng K. Cai and M. Li. 2019. RWF-2000: An Open Large Scale Video Database for Violence Detection. arXiv preprint arXiv:1911.05913(2019)."},{"key":"e_1_3_2_1_12_1","volume-title":"Torch: A Modular Machine Learning Software Library.","author":"Collobert R.","year":"2002","unstructured":"R. Collobert, S. Bengio, and J. Marithoz. 2002. Torch: A Modular Machine Learning Software Library."},{"key":"e_1_3_2_1_13_1","volume-title":"Multimedia: Benchmarking Violent Scenes Detection","author":"Constantin M.","year":"2020","unstructured":"M. Constantin, L. Stefan, B. Ionescu, C. Demarty, M. Sjoberg, M. Schedl, and G. Gravier. 2020. Affect in Multimedia: Benchmarking Violent Scenes Detection. IEEE Transactions on Affective Computing(2020)."},{"key":"e_1_3_2_1_14_1","unstructured":"Q. Dai R. Zhao Z. Wu X. Wang Z. Gu W. Wu and Y. Jiang. 2015. Fudan-Huawei at MediaEval 2015: Detecting Violent Scenes and Affective Impact in Movies with Deep Learning.. In MediaEval."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"A. Datta M. Shah and N. Lobo. 2002. Person-on-person violence detection in video data. In ICPR Vol.\u00a01. 433\u2013438.","DOI":"10.1109\/ICPR.2002.1044748"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"S. Davis and P. Mermelstein. 1990. Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences. In Readings in Speech Recognition. 65\u201374.","DOI":"10.1016\/B978-0-08-051584-7.50010-3"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"F. De\u00a0Souza G. Chavez E. do Valle\u00a0Jr and A. Ara\u00fajo. 2010. Violence detection in video using spatio-temporal features. In SIBGRAPI. 224\u2013230.","DOI":"10.1109\/SIBGRAPI.2010.38"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"C. Demarty B. Ionescu Y. Jiang V. Quang M. Schedl and C. Penet. 2014. Benchmarking violent scenes detection in movies. In CBMI. 1\u20136.","DOI":"10.1109\/CBMI.2014.6849827"},{"key":"e_1_3_2_1_19_1","unstructured":"O. Deniz I. Serrano G. Bueno and T. Kim. 2014. Fast violence detection in video. In IEEE VISAPP Vol.\u00a02. 478\u2013485."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Z. Dong J. Qin and Y. Wang. 2016. Multi-stream deep networks for person to person violence detection in videos. In CCPR. 517\u2013531.","DOI":"10.1007\/978-981-10-3002-4_43"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"H. Doughty W. Mayol-Cuevas and D. Damen. 2019. The pros and cons: Rank-aware temporal attention for skill determination in long videos. In CVPR.","DOI":"10.1109\/CVPR.2019.00805"},{"key":"e_1_3_2_1_22_1","unstructured":"I. Febin K. Jayasree and P. Joy. 2019. Violence detection in videos for an intelligent surveillance system using MoBSIFT and movement filtering algorithm. Pattern Analysis and Applications(2019) 1\u201313."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2016.01.006"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"R. Girdhar J. Carreira C. Doersch and A. Zisserman. 2019. Video action transformer network. In CVPR. 244\u2013253.","DOI":"10.1109\/CVPR.2019.00033"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Y. Gong W. Wang S. Jiang Q. Huang and W. Gao. 2008. Detecting violent scenes in movies by auditory and visual cues. In IEEE PCM. 317\u2013326.","DOI":"10.1007\/978-3-540-89796-5_33"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"K. Hayashi M. Seki T. Hirai T. Koichi and S. Koichi. 2005. Real-time violent action detector for elevator. In Optomechatronic Machine Vision Vol.\u00a06051. 60510R.","DOI":"10.1117\/12.648790"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"L. Huang W. Wang J. Chen and X. Wei. 2019. Attention on attention for image captioning. In ICCV. 4634\u20134643.","DOI":"10.1109\/ICCV.2019.00473"},{"key":"e_1_3_2_1_28_1","unstructured":"A. Krizhevsky I. Sutskever and G. Hinton. 2012. Imagenet classification with deep convolutional neural networks. In NeurIPS. 1097\u20131105."},{"key":"e_1_3_2_1_29_1","unstructured":"V. Lam S. Le D.-D. Le S. Satoh and D.\u00a0A. Duong. 2015. NII-UIT at MediaEval 2015 Affective Impact of Movies Task.. In MediaEval."},{"volume-title":"ACM on Multimedia Conference. 586\u2013590","author":"Li X.","key":"e_1_3_2_1_30_1","unstructured":"X. Li, Y. Huo, Q. Jin, and J. Xu. 2016. Detecting Violence in Video using Subclasses. In ACM on Multimedia Conference. 586\u2013590."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2017.10.011"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"X. Long C. Gan G. De\u00a0Melo X. Liu Y. Li F. Li and S. Wen. 2018. Multimodal keyless attention fusion for video classification. In AAAI.","DOI":"10.1609\/aaai.v32i1.12319"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"X. Long C. Gan G. De\u00a0Melo J. Wu X. Liu and S. Wen. 2018. Attention clusters: Purely attention based local feature integration for video classification. In CVPR.","DOI":"10.1109\/CVPR.2018.00817"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"L. Meng B. Zhao B. Chang G. Huang W. Sun F. Tung and L. Sigal. 2019. Interpretable spatio-temporal attention for video action recognition. In ICCV.","DOI":"10.1109\/ICCVW.2019.00189"},{"volume-title":"CEUR Workshop 1436(2015)","author":"Moreira D.","key":"e_1_3_2_1_35_1","unstructured":"D. Moreira, S. Avila, M. Perez, D. Moraes, V. Testoni, E. Valle, S. Goldenstein, and A. Rocha. 2015. RECOD at MediaEval 2015: Affective impact of movies task. CEUR Workshop 1436(2015)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2018.03.001"},{"key":"e_1_3_2_1_37_1","volume-title":"WVD: A New Synthetic Dataset for Video-Based Violence Detection. In SGAI. 158\u2013164.","author":"Nadeem M.","year":"2019","unstructured":"M. Nadeem, V. Franqueira, F. Kurugollu, and X. Zhai. 2019. WVD: A New Synthetic Dataset for Video-Based Violence Detection. In SGAI. 158\u2013164."},{"key":"e_1_3_2_1_38_1","unstructured":"N. Parmar A. Vaswani J. Uszkoreit L. Kaiser N. Shazeer A. Ku and D. Tran. 2018. Image Transformer. In ICML. 4055\u20134064."},{"volume-title":"PyTorch: An Imperative Style","author":"Paszke A.","key":"e_1_3_2_1_39_1","unstructured":"A. Paszke, S. Gross, F. Massa, A. Lerer, 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In NeurIPS. 8024\u20138035."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"W. Pei T. Baltru\u0161aitis D. Tax and L. Morency. 2017. Temporal attention-gated model for robust sequence classification. In CVPR. 820\u2013829.","DOI":"10.1109\/CVPR.2017.94"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"B. Peixoto S. Avila Z. Dias and A. Rocha. 2018. Breaking down violence: A deep-learning strategy to model and classify violence in videos. In ARES. 1\u20137.","DOI":"10.1145\/3230833.3232809"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"B. Peixoto B. Lavi P. Bestagini Z. Dias and A. Rocha. 2020. Multimodal Violence Detection in Videos. In IEEE ICASSP. 2957\u20132961.","DOI":"10.1109\/ICASSP40776.2020.9054018"},{"volume-title":"Toward subjective violence detection in videos","author":"Peixoto B.","key":"e_1_3_2_1_43_1","unstructured":"B. Peixoto, B. Lavi, J. Martin, S. Avila, Z. Dias, and An. Rocha. 2019. Toward subjective violence detection in videos. In IEEE ICASSP. 8276\u20138280."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Y. Peng Y. Zhao and J. Zhang. 2018. Two-stream collaborative learning with spatial-temporal attention for video classification. IEEE TCSVT 29 3 (2018).","DOI":"10.1109\/TCSVT.2018.2808685"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2738401"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2932114"},{"key":"e_1_3_2_1_47_1","unstructured":"S. Sharma R. Kiros and R. Salakhutdinov. 2015. Action recognition using visual attention. arXiv preprint arXiv:1511.04119(2015)."},{"key":"e_1_3_2_1_48_1","unstructured":"M. Sj\u00f6berg Y. Baveye H. Wang V. Quang B. Ionescu E. Dellandr\u00e9a M. Schedl C. Demarty and L. Chen. 2015. The MediaEval 2015 Affective Impact of Movies Task. In MediaEval."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Y. Su G. Lin J. Zhu and Q. Wu. 2020. Human Interaction Learning on 3D Skeleton Point Clouds for Video Violence Recognition. In ECCV.","DOI":"10.1007\/978-3-030-58548-8_5"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"S. Sudhakaran and O. Lanz. 2017. Learning to detect violent videos using convolutional long short-term memory. In IEEE AVSS. 1\u20136.","DOI":"10.1109\/AVSS.2017.8078468"},{"key":"e_1_3_2_1_51_1","unstructured":"M. Tan and Q. Le. 2019. EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks. In ICML. 6105\u20136114."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"D. Tran L. Bourdev R. Fergus L. Torresani and M. Paluri. 2015. Learning spatiotemporal features with 3d convolutional networks. In ICCV.","DOI":"10.1109\/ICCV.2015.510"},{"volume-title":"CEUR Workshop, Vol.\u00a01436","author":"Trigeorgis G.","key":"e_1_3_2_1_53_1","unstructured":"G. Trigeorgis, E. Coutinho, F. Ringeval, E. Marchi, S. Zafeiriou, and B. Schuller. 2015. The ICL-TUM-PASSAU approach for the MediaEval 2015\u201d affective impact of movies\u201d task. In CEUR Workshop, Vol.\u00a01436."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.3390\/s19112472"},{"key":"e_1_3_2_1_55_1","unstructured":"A. Vaswani N. Shazeer N. Parmar J. Uszkoreit L. Jones A. Gomez \u0141. Kaiser and I. Polosukhin. 2017. Attention is all you need. In NeurIPS. 6000\u20136010."},{"key":"e_1_3_2_1_56_1","unstructured":"M. Vlastelica S. Hayrapetyan M. Tapaswi and R. Stiefelhagen. 2015. KIT at MediaEval 2015-Evaluating Visual Cues for Affective Impact of Movies Task.. In MediaEval."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"H. Wang A. Kl\u00e4ser C. Schmid and C.-L. Liu. 2011. Action recognition by dense trajectories. In CVPR. 3169\u20133176.","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"e_1_3_2_1_58_1","unstructured":"K. Xu J. Ba R. Kiros K. Cho A. Courville R. Salakhudinov R. Zemel and Y. Bengio. 2015. Show attend and tell: Neural image caption generation with visual attention. In ICML. 2048\u20132057."},{"key":"e_1_3_2_1_59_1","unstructured":"Y. Yi H. Wang B. Zhang and J. Yu. 2015. MIC-TJU in MediaEval 2015 Affective Impact of Movies Task. In MediaEval."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"crossref","unstructured":"A. Zeyer K. Irie R. Schl\u00fcter and He. Ney. 2018. Improved Training of End-to-end Attention Models for Speech Recognition. In Interspeech. 7\u201311.","DOI":"10.21437\/Interspeech.2018-1616"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","unstructured":"P. Zhou Q. Ding H. Luo and X. Hou. 2017. Violent interaction detection in video based on deep learning. In Journal of physics: conference series Vol.\u00a0844. 012044.","DOI":"10.1088\/1742-6596\/844\/1\/012044"}],"event":{"name":"ARES 2021: The 16th International Conference on Availability, Reliability and Security","acronym":"ARES 2021","location":"Vienna Austria"},"container-title":["Proceedings of the 16th International Conference on Availability, Reliability and Security"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3465481.3470059","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3465481.3470059","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:17:24Z","timestamp":1750191444000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3465481.3470059"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,8,17]]},"references-count":61,"alternative-id":["10.1145\/3465481.3470059","10.1145\/3465481"],"URL":"https:\/\/doi.org\/10.1145\/3465481.3470059","relation":{},"subject":[],"published":{"date-parts":[[2021,8,17]]},"assertion":[{"value":"2021-08-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}