{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T13:43:02Z","timestamp":1760708582280},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2013,6,1]],"date-time":"2013-06-01T00:00:00Z","timestamp":1370044800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Machine Vision and Applications"],"published-print":{"date-parts":[[2014,1]]},"DOI":"10.1007\/s00138-013-0521-1","type":"journal-article","created":{"date-parts":[[2013,5,31]],"date-time":"2013-05-31T05:49:16Z","timestamp":1369979356000},"page":"71-84","source":"Crossref","is-referenced-by-count":19,"title":["Human interaction categorization by using audio-visual cues"],"prefix":"10.1007","volume":"25","author":[{"given":"M. J.","family":"Mar\u00edn-Jim\u00e9nez","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"R.","family":"Mu\u00f1oz-Salinas","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"E.","family":"Yeguas-Bolivar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"N.","family":"P\u00e9rez de la Blanca","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2013,6,1]]},"reference":[{"key":"521_CR1","doi-asserted-by":"crossref","unstructured":"Bakker, E., Lew, M.: Semantic video retrieval using audio analysis. In: Lew, M., Sebe, N., Eakins, J. (eds.) Image and video retrieval. Lecture Notes in Computer Science, vol. 2383, pp. 201\u2013218. Springer, International Conference on Image and Video Retrieval, London (2002)","DOI":"10.1007\/3-540-45479-9_29"},{"key":"521_CR2","unstructured":"Bredin, H., Koenig, L., Farinas, J.: Irit at trecvid 2010: Hidden markov models for context-aware late fusion of multiple audio classifiers. In: TRECVID 2010 Notebook papers (2010)"},{"key":"521_CR3","unstructured":"Dalal, N., Triggs, B.: Histograms of oriented gradients for human detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, vol. 1, pp. 886\u2013893. IEEE Computer Society, Washington, DC (2005)"},{"key":"521_CR4","doi-asserted-by":"crossref","unstructured":"Dalal, N., Triggs, B., Schmid, C.: Human detection using oriented histograms of flow and appearance. In: Proceedings of the European Conference on Computer Vision (2006)","DOI":"10.1007\/11744047_33"},{"issue":"4","key":"521_CR5","doi-asserted-by":"crossref","first-page":"357","DOI":"10.1109\/TASSP.1980.1163420","volume":"28","author":"S Davis","year":"1980","unstructured":"Davis, S., Mermelstein, P.: Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences. IEEE Trans. Acoust. Speech Signal Process. 28(4), 357\u2013366 (1980)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"521_CR6","unstructured":"Delaitre, V., Sivic, J., Laptev, I.: Learning person-object interactions for action recognition in still images. In: Advances in Neural Information Processing Systems (2011)"},{"issue":"12","key":"521_CR7","doi-asserted-by":"crossref","first-page":"2247","DOI":"10.1109\/TPAMI.2007.70711","volume":"29","author":"L Gorelick","year":"2007","unstructured":"Gorelick, L., Blank, M., Shechtman, E., Irani, M., Basri, R.: Actions as space-time shapes. Trans. Pattern Anal. Mach. Intell. 29(12), 2247\u20132253 (2007)","journal-title":"Trans. Pattern Anal. Mach. Intell."},{"key":"521_CR8","unstructured":"Inoue, N., Wada, T., Kamishima, Y., Shinoda, K., Sato, S.: Tokyotech+canon at trecvid 2011. In: TRECVID 2011 Notebook papers (2011)"},{"key":"521_CR9","doi-asserted-by":"crossref","unstructured":"Jiang, Y.G., Ye, G., Chang, S.F., Ellis, D., Loui, A.C.: Consumer video understanding: a benchmark database and an evaluation of human and machine performance. In: Proceedings of ACM International Conference on Multimedia Retrieval (ICMR), oral session (2011)","DOI":"10.1145\/1991996.1992025"},{"issue":"2\/3","key":"521_CR10","doi-asserted-by":"crossref","first-page":"107","DOI":"10.1007\/s11263-005-1838-7","volume":"64","author":"I Laptev","year":"2005","unstructured":"Laptev, I.: On space-time interest points. Int. J. Comput. Vis. 64(2\/3), 107\u2013123 (2005)","journal-title":"Int. J. Comput. Vis."},{"key":"521_CR11","doi-asserted-by":"crossref","unstructured":"Laptev, I., Marszalek, M., Schmid, C., Rozenfeld, B.: Learning realistic human actions from movies. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2008)","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"521_CR12","doi-asserted-by":"crossref","unstructured":"Laptev, I., P\u00e9rez, P.: Retrieving actions in movies. In: Proceedings of the International Conference on Computer Vision, pp. 1\u20138 (2007)","DOI":"10.1109\/ICCV.2007.4409105"},{"key":"521_CR13","unstructured":"Lartillot, O., Toiviainen, P.: MIR in Matlab (ii): a toolbox for musical feature extraction from audio. In: ISMIR, pp. 127\u2013130 (2007)"},{"key":"521_CR14","unstructured":"Li, Y., Mou, L., Jiang, M., Su, C., Fang, X., Qian, M., Tian, Y., Wang, Y., Huang, T., Gao, W.: Pku-idm at trecvid 2010: copy detection with visual-audio feature fusion and sequential pyramid matching. In: TRECVID 2010 Notebook papers (2010)"},{"key":"521_CR15","unstructured":"MacQueen, J.B.: Some methods for classification and analysis of multivariate observations. In: Cam, L.M.L., Neyman, J. (eds.) Proceedings of the fifth Berkeley Symposium on Mathematical Statistics and Probability, vol. 1, pp. 281\u2013297. University of California Press (1967)"},{"issue":"1","key":"521_CR16","doi-asserted-by":"crossref","first-page":"50","DOI":"10.1214\/aoms\/1177730491","volume":"18","author":"HB Mann","year":"1947","unstructured":"Mann, H.B., Whitney, D.R.: On a test of whether one of two random variables is stochastically larger than the other. Ann. Math. Stat. 18(1), 50\u201360 (1947)","journal-title":"Ann. Math. Stat."},{"key":"521_CR17","doi-asserted-by":"crossref","unstructured":"Marin-Jimenez, M., Zisserman, A., Ferrari, V.: \u201cHere\u2019s looking at you, kid\u201d. Detecting people looking at each other in videos. In: Proceedings of the British Machine Vision Conference (2011)","DOI":"10.5244\/C.25.22"},{"key":"521_CR18","doi-asserted-by":"crossref","unstructured":"Marsza\u0142ek, M., Laptev, I., Schmid, C.: Actions in context. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2009)","DOI":"10.1109\/CVPR.2009.5206557"},{"issue":"3","key":"521_CR19","doi-asserted-by":"crossref","first-page":"305","DOI":"10.1109\/TPAMI.2005.49","volume":"27","author":"I McCowan","year":"2005","unstructured":"McCowan, I., Gatica-Perez, D., Bengio, S., Lathoud, G., Barnard, M., Zhang, D.: Automatic analysis of multimodal group actions in meetings. IEEE Trans. Pattern Anal. Mach. Intell. 27(3), 305\u2013317 (2005)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"12","key":"521_CR20","doi-asserted-by":"crossref","first-page":"2441","DOI":"10.1109\/TPAMI.2012.24","volume":"34","author":"A Patron-Perez","year":"2012","unstructured":"Patron-Perez, A., Marszalek, M., Reid, I., Zisserman, A.: Structured learning of human interactions in TV shows. IEEE Trans. Pattern Anal. Mach. Intell. 34(12), 2441\u20132453 (2012)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"521_CR21","unstructured":"Perera, A.G.A., Oh, S., Leotta, M., Kim, I., Byun, B., Lee, C.H., McCloskey, S., Liu, J., Miller, B., Huang, Z.F., Vahdat, A., Yang, W., Mori, G., Tang, K., Koller, D., Fei-Fei, L., Li, K., Chen, G., Corso, J., Fu, Y., Srihari, R.: Genie trecvid2011 multimedia event detection: late-fusion approaches to combine multiple audio-visual features. In: TRECVID 2011 Notebook papers (2011)"},{"key":"521_CR22","unstructured":"Press, W.H., Teukolsky, S.A., Vetterling, W., Flannery, B.P.: Numerical recipes in C++: the art of scientific computing, 2 edn. Cambridge University Press, Cambridge (2002)"},{"key":"521_CR23","doi-asserted-by":"crossref","unstructured":"Reid, I., Benfold, B., Patron, A., Sommerlade, E.: Understanding interactions and guiding visual surveillance by tracking attention. In: Koch, R., Huang, F. (eds.) Computer Vision\u2014ACCV 2010 Workshops. Lecture Notes in Computer Science, vol. 6468, pp. 380\u2013389. Springer, Berlin\/Heidelberg (2011)","DOI":"10.1007\/978-3-642-22822-3_38"},{"key":"521_CR24","doi-asserted-by":"crossref","unstructured":"Ryoo, M., Aggarwal, J.: Spatio-temporal relationship match: Video structure comparison for recognition of complex human activities. In: Proceedings of the International Conference on Computer Vision, pp. 1593\u20131600 (2009)","DOI":"10.1109\/ICCV.2009.5459361"},{"key":"521_CR25","doi-asserted-by":"crossref","unstructured":"Sch\u00fcldt, C., Laptev, I., Caputo, B.: Recognizing human actions: a local SVM approach. In: Proceedings of the International Conference on Pattern Recognition, vol. 3, pp. 32\u201336, Cambridge (2004)","DOI":"10.1109\/ICPR.2004.1334462"},{"key":"521_CR26","unstructured":"Sidiropoulos, P., Mezaris, V., Kompatsiaris, I., Meinedo, H., Bugalho, M., Trancoso, I.: On the use of audio events for improving video scene segmentation. In: 2010 11th International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS), pp. 1\u20134 (2010)"},{"key":"521_CR27","doi-asserted-by":"crossref","unstructured":"Sivic, J., Zisserman, A.: Video Google: a text retrieval approach to object matching in videos. In: Proceedings of the International Conference on Computer Vision, vol. 2, pp. 1470\u20131477 (2003)","DOI":"10.1109\/ICCV.2003.1238663"},{"key":"521_CR28","doi-asserted-by":"crossref","unstructured":"Smeaton, A.F., Over, P., Kraaij, W.: Evaluation campaigns and TRECVid. In: MIR \u201906: Proceedings of the 8th ACM International Workshop on Multimedia Information Retrieval, pp. 321\u2013330. ACM Press, New York (2006)","DOI":"10.1145\/1178677.1178722"},{"key":"521_CR29","doi-asserted-by":"crossref","first-page":"185","DOI":"10.1121\/1.1915893","volume":"8","author":"S Stevens","year":"1937","unstructured":"Stevens, S., Volkmann, J., Newman, E.: A scale for the measurement of the psychological magnitude of pitch. J. Acoust. Soc. Am. 8, 185\u2013190 (1937)","journal-title":"J. Acoust. Soc. Am."},{"key":"521_CR30","unstructured":"Turaga, P., Chellappa, R., Subrahmanian, V.S., Udrea, O.: Machine recognition of human activities: a survey. IEEE Trans. Circ. Syst. Video Technol. 18(11), 1473\u20131488 (2008)"},{"key":"521_CR31","unstructured":"Tzanetakis, G., Chen, M.: Building audio classification for broadcast news retrieval. In: Proceedings of WIAMIS (2004)"},{"key":"521_CR32","doi-asserted-by":"crossref","unstructured":"Varma, M., Babu, B.R.: More generality in efficient multiple kernel learning. In: ICML, p. 134 (2009)","DOI":"10.1145\/1553374.1553510"},{"key":"521_CR33","unstructured":"Vedaldi, A., Fulkerson, B.: VLFeat: an open and portable library of computer vision algorithms. http:\/\/www.vlfeat.org\/ (2008)"},{"issue":"3","key":"521_CR34","doi-asserted-by":"crossref","first-page":"480","DOI":"10.1109\/TPAMI.2011.153","volume":"34","author":"A Vedaldi","year":"2012","unstructured":"Vedaldi, A., Zisserman, A.: Efficient additive kernels via explicit feature maps. IEEE Trans. Pattern Anal. Mach. Intell. 34(3), 480\u2013492 (2012)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"521_CR35","unstructured":"Vishwanathan, S.V.N., Sun, Z., Theera-Ampornpunt, N., Varma, M.: Multiple kernel learning and the SMO algorithm. In: Advances in Neural Information Processing Systems (2010)"},{"key":"521_CR36","doi-asserted-by":"crossref","unstructured":"Wang, H., Ullah, M., Kl\u00e4ser, A., Laptev, I., Schmid, C.: Evaluation of local spatio-temporal features for action recognition. In: Proceedings of the British Machine Vision Conference, p. 127 (2009)","DOI":"10.5244\/C.23.124"},{"key":"521_CR37","doi-asserted-by":"crossref","unstructured":"Weinland, D., Ronfard, R., Boyer, E.: Free viewpoint action recognition using motion history volumes. Comput. Vis. Image Underst. 104(2\u20133), 249\u2013257 (2006)","DOI":"10.1016\/j.cviu.2006.07.013"},{"issue":"6","key":"521_CR38","doi-asserted-by":"crossref","first-page":"80","DOI":"10.2307\/3001968","volume":"1","author":"F Wilcoxon","year":"1945","unstructured":"Wilcoxon, F.: Individual comparisons by ranking methods. Biom. Bull. 1(6), 80\u201383 (1945)","journal-title":"Biom. Bull."},{"key":"521_CR39","doi-asserted-by":"crossref","unstructured":"Yao, B., Jiang, X., Khosla, A., Lin, A., Guibas, L., Fei-Fei, L.: Action recognition by learning bases of action attributes and parts. In: Proceedings of the International Conference on Computer Vision, Barcelona, Spain (2011)","DOI":"10.1109\/ICCV.2011.6126386"},{"key":"521_CR40","doi-asserted-by":"crossref","unstructured":"Ye, G., Jhuo, I.H., Liu, D., Jiang, Y.G., Lee, D.T., Chang, S.F.: Joint audio-visual bi-modal codewords for video event detection. In: ICMR, p. 39 (2012)","DOI":"10.1145\/2324796.2324843"},{"key":"521_CR41","unstructured":"Ye, G., Liu, D., Jhuo, I.H., Chang, S.F.: Robust late fusion with rank minimization. In: Proceedings of the IEEE Conference on Computer Vision and, Pattern Recognition, pp. 3021\u20133028 (2012)"}],"container-title":["Machine Vision and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-013-0521-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s00138-013-0521-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-013-0521-1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,7,14]],"date-time":"2019-07-14T08:33:04Z","timestamp":1563093184000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s00138-013-0521-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2013,6,1]]},"references-count":41,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2014,1]]}},"alternative-id":["521"],"URL":"https:\/\/doi.org\/10.1007\/s00138-013-0521-1","relation":{},"ISSN":["0932-8092","1432-1769"],"issn-type":[{"value":"0932-8092","type":"print"},{"value":"1432-1769","type":"electronic"}],"subject":[],"published":{"date-parts":[[2013,6,1]]}}}