{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,10]],"date-time":"2025-10-10T02:02:12Z","timestamp":1760061732876,"version":"3.37.3"},"reference-count":60,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2013,7,16]],"date-time":"2013-07-16T00:00:00Z","timestamp":1373932800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Machine Vision and Applications"],"published-print":{"date-parts":[[2014,1]]},"DOI":"10.1007\/s00138-013-0525-x","type":"journal-article","created":{"date-parts":[[2013,7,15]],"date-time":"2013-07-15T15:59:36Z","timestamp":1373903976000},"page":"49-69","source":"Crossref","is-referenced-by-count":42,"title":["Multimedia event detection with multimodal feature fusion and temporal concept localization"],"prefix":"10.1007","volume":"25","author":[{"given":"Sangmin","family":"Oh","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Scott","family":"McCloskey","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ilseo","family":"Kim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Arash","family":"Vahdat","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kevin J.","family":"Cannons","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hossein","family":"Hajimirsadeghi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Greg","family":"Mori","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"A. G. Amitha","family":"Perera","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Megha","family":"Pandey","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jason J.","family":"Corso","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2013,7,16]]},"reference":[{"key":"525_CR1","unstructured":"http:\/\/www.lscom.org\/"},{"key":"525_CR2","unstructured":"TRECVID 2011 Multimedia Event Detection Evaluation Plan Version 3.0. http:\/\/www.nist.gov\/itl\/iad\/mig\/upload\/MED11-EvalPlan-V03-20110801a.pdf"},{"key":"525_CR3","doi-asserted-by":"crossref","unstructured":"Bach, F.R., Lanckriet, G.R.G., Jordan, M.I.: Multiple kernel learning, conic duality, and the smo algorithm. In: ICML (2004)","DOI":"10.1145\/1015330.1015424"},{"key":"525_CR4","doi-asserted-by":"crossref","unstructured":"Bao, L., Cao, J., Zhang, Y., Li, J., yu Chen, M., Hauptmann, A.G.: Explicit and implicit concept-based video retrieval with bipartite graph propagation model. In: ACM Multimedia (2010)","DOI":"10.1145\/1873951.1874118"},{"key":"525_CR5","doi-asserted-by":"crossref","unstructured":"Blei, D.M., Jordan, M.I.: Modeling annotated data. In: ACM SIGIR, pp. 127\u2013134 (2003)","DOI":"10.1145\/860458.860460"},{"key":"525_CR6","doi-asserted-by":"crossref","unstructured":"Byun, B., Kim, I., Siniscalchi, S.M., Lee, C.H.: Consumer-level multimedia event detection through unsupervised audio signal modeling. In: InterSpeech (2012)","DOI":"10.21437\/Interspeech.2012-555"},{"key":"525_CR7","unstructured":"Cao, L., Chang, S.F., Codella, N., Cotton, C., Ellis, D., Gong, L., Hill, M., Hua, G., Kender, J., Merler, M., Mu, Y., Smith, J.R., Yu, F.X.: IBM research and Columbia University TRECVID-2012 multimedia event detection (MED), multimedia event recounting (MER), and semantic indexing (SIN) systems (2012)"},{"key":"525_CR8","doi-asserted-by":"crossref","unstructured":"Cao, L., Fei-Fei, L.: Spatially coherent latent topic model for concurrent segmentation and classification of objects and scenes. In: ICCV (2007)","DOI":"10.1109\/ICCV.2007.4408965"},{"issue":"3","key":"525_CR9","doi-asserted-by":"crossref","first-page":"27:1","DOI":"10.1145\/1961189.1961199","volume":"2","author":"CC Chang","year":"2011","unstructured":"Chang, C.C., Lin, C.J.: Libsvm: a library for support vector machines. ACM Trans. Intell. Syst. Technol. 2(3), 27:1\u201327:27 (2011)","journal-title":"ACM Trans. Intell. Syst. Technol."},{"key":"525_CR10","doi-asserted-by":"crossref","unstructured":"Dalal, N., Triggs, B.: Histograms of oriented gradients for human detection. In: CVPR (2005)","DOI":"10.1109\/CVPR.2005.177"},{"key":"525_CR11","doi-asserted-by":"crossref","unstructured":"Deng, J., Berg, A.C., Li, K., Fei-Fei, L.: What does classifying more than 10,000 image categories tell us? In: ECCV (2010)","DOI":"10.1007\/978-3-642-15555-0_6"},{"key":"525_CR12","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"issue":"9","key":"525_CR13","doi-asserted-by":"crossref","first-page":"1627","DOI":"10.1109\/TPAMI.2009.167","volume":"32","author":"P Felzenszwalb","year":"2010","unstructured":"Felzenszwalb, P., Girshick, R., McAllester, D., Ramanan, D.: Object detection with discriminatively trained part based models. IEEE Trans. Pattern Anal. Mach. Intell. 32(9), 1627\u20131645 (2010)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"525_CR14","doi-asserted-by":"crossref","unstructured":"Feng, J., Zheng, Y., Yan, S.: Towards a universal detector by mining concepts with small semantic gaps. In: ACM Multimedia (2010)","DOI":"10.1145\/1873951.1874332"},{"key":"525_CR15","unstructured":"Feng, Y., Lapata, M.: Topic models for image annotation and text illustration. In: NAACL HLT (2010)"},{"key":"525_CR16","doi-asserted-by":"crossref","unstructured":"Gao, S., Wu, W., Lee, C.H., Chua, T.S.: A mfom learning approach to robust multiclass multi-label text categorization. In: ICML (2004)","DOI":"10.1145\/1015330.1015361"},{"key":"525_CR17","doi-asserted-by":"crossref","unstructured":"Guillaumin, M., Mensink, T., Verbeek, J., Schmid, C.: TagProp: discriminative metric learning in nearest neighbor models for image auto-annotation. In: ICCV (2009)","DOI":"10.1109\/ICCV.2009.5459266"},{"issue":"4","key":"525_CR18","doi-asserted-by":"crossref","first-page":"602","DOI":"10.1109\/JPROC.2008.916355","volume":"96","author":"AG Hauptmann","year":"2008","unstructured":"Hauptmann, A.G., Christel, M.G., Yan, R.: Video retrieval based on semantic concepts. Proc. IEEE 96(4), 602\u2013622 (2008)","journal-title":"Proc. IEEE"},{"key":"525_CR19","doi-asserted-by":"crossref","unstructured":"Hu, W., Xie, N., Li, L., Zeng, X., Maybank, S.J.: A survey on visual content-based video indexing and retrieval. IEEE Trans. Syst. Man Cybern. Part C 41(6), 797\u2013819 (2011). URL: http:\/\/dx.doi.org\/10.1109\/TSMCC.2011.2109710","DOI":"10.1109\/TSMCC.2011.2109710"},{"issue":"12","key":"525_CR20","doi-asserted-by":"crossref","first-page":"2270","DOI":"10.1016\/j.patcog.2005.01.012","volume":"38","author":"A Jain","year":"2005","unstructured":"Jain, A., Nandakumar, K., Ross, A.: Score normalization in multimodal biometric systems. Pattern Recogn. 38(12), 2270\u20132285 (2005)","journal-title":"Pattern Recogn."},{"key":"525_CR21","doi-asserted-by":"crossref","unstructured":"Jiang, L., Hauptmann, A.G., Xiang, G.: Leveraging high-level and low-level features for multimedia event detection. In: ACM-MM (2012)","DOI":"10.1145\/2393347.2393412"},{"key":"525_CR22","doi-asserted-by":"crossref","unstructured":"Jiang, W., Loui, A.C.: Audio-visual grouplet: temporal audio-visual interactions for general video concept classification. In: ACM Multimedia (2011)","DOI":"10.1145\/2072298.2072316"},{"key":"525_CR23","unstructured":"Jiang, Y.G., Zeng, X., Ye, G., Bhattacharya, S., Ellis, D., Shah, M., Chang, S.F.: Combining multiple modalities, contextual concepts, and temporal matching. In: NIST TRECVID Workshop (2010)"},{"key":"525_CR24","doi-asserted-by":"crossref","first-page":"2345","DOI":"10.1109\/5.726793","volume":"86","author":"S Katagiri","year":"1998","unstructured":"Katagiri, S., Juang, B.H., Lee, C.H.: Pattern recognition using a family of design algorithm based upon the generalized probabilistic descent method. Proc. IEEE 86, 2345\u20132373 (1998)","journal-title":"Proc. IEEE"},{"key":"525_CR25","unstructured":"Kim, I., Lee, C.H.: Optimization of average precision with maximal figure-of-merit learning. In: MLSP (2011)"},{"key":"525_CR26","doi-asserted-by":"crossref","unstructured":"Kim, I., Oh, S., Byun, B., Perera, A.G.A., Lee, C.H.: Explicit performance metric optimization for fusion-based video retrieval. In: ECCV Workshops, no. 3 (2012)","DOI":"10.1007\/978-3-642-33885-4_40"},{"key":"525_CR27","doi-asserted-by":"crossref","unstructured":"Kim, I., Oh, S., Byun, B., Perera, A.G.A., Lee, C.H.: Explicit performance metric optimization for fusion-based video retrieval. In: ECCV Workshop (2012)","DOI":"10.1007\/978-3-642-33885-4_40"},{"key":"525_CR28","doi-asserted-by":"crossref","first-page":"226","DOI":"10.1109\/34.667881","volume":"20","author":"J Kittler","year":"1998","unstructured":"Kittler, J., Hatef, M., Duin, R.P.W., Matas, J.: On combining classifiers. PAMI 20, 226\u2013239 (1998)","journal-title":"PAMI"},{"key":"525_CR29","doi-asserted-by":"crossref","unstructured":"Klaser, A., Marszalek, M., Schmid, C.: A spatio-temporal descriptor based on 3d-gradients. In: BMVC (2008)","DOI":"10.5244\/C.22.99"},{"key":"525_CR30","doi-asserted-by":"crossref","unstructured":"Lan, Z.Z., Bao, L., Yu, S.I., Liu, W., Hauptmann, A.G.: Double fusion for multimedia event detection. In: ICME (2012)","DOI":"10.1007\/978-3-642-27355-1_18"},{"key":"525_CR31","doi-asserted-by":"crossref","unstructured":"Le, Q., Zou, W., Yeung, S., Ng, A.: Learning hierarchical spatio-temporal features for action recognition with independent subspace analysis. In: CVPR (2011)","DOI":"10.1109\/CVPR.2011.5995496"},{"key":"525_CR32","unstructured":"Lee, C.H., Soong, F.K., Juang, B.H.: A segment model based approach to speech recognition. In: ICASSP (1988)"},{"key":"525_CR33","unstructured":"Lee, K., Ellis, D.P.W.: Audio-based semantic concept classification for consumer video. IEEE Trans. Audio Speech Lang. Process. 18(6), 1406\u20131416 (2010)"},{"key":"525_CR34","unstructured":"Li, L.J., Su, H., Xing, E.P., Li, F.F.: Object bank: A high-level image representation for scene classification & semantic feature sparsification. In: NIPS (2010)"},{"key":"525_CR35","doi-asserted-by":"crossref","unstructured":"Liu, J., McCloskey, S., Liu, Y.: Local expert forest of score fusion for video event classification. In: ECCV (2012)","DOI":"10.1007\/978-3-642-33715-4_29"},{"key":"525_CR36","doi-asserted-by":"crossref","unstructured":"Ma, A.J., Yuen, P.C.: Linear dependency modeling for feature fusion. In: ICCV, pp. 2041\u20132048 (2011)","DOI":"10.1109\/ICCV.2011.6126477"},{"key":"525_CR37","doi-asserted-by":"crossref","unstructured":"Maji, S., Berg, A.C., Malik, J.: Classification using intersection kernel support vector machines is efficient. In: CVPR (2008)","DOI":"10.1109\/CVPR.2008.4587630"},{"key":"525_CR38","doi-asserted-by":"crossref","unstructured":"Makadia, A., Pavlovic, V., Kumar, S.: A new baseline for image annotation. In: ECCV (2008)","DOI":"10.1007\/978-3-540-88690-7_24"},{"key":"525_CR39","doi-asserted-by":"crossref","unstructured":"Natarajan, P., Wu, S., Vitaladevuni, S.N.P., Zhuang, X., Tsakalidis, S., Park, U., Prasad, R., Natarajan, P.: Multimodal feature fusion for robust event detection in web videos. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6247814"},{"key":"525_CR40","doi-asserted-by":"crossref","unstructured":"Niculescu-Mizil, A., Caruana, R.: Predicting good probabilities with supervised learning. In: ICML (2005)","DOI":"10.1145\/1102351.1102430"},{"issue":"3","key":"525_CR41","doi-asserted-by":"crossref","first-page":"145","DOI":"10.1023\/A:1011139631724","volume":"42","author":"A Oliva","year":"2001","unstructured":"Oliva, A., Torralba, A.: Modeling the shape of the scene: a holistic representation of the spatial envelope. Int. J. Comput. Vis. 42(3), 145\u2013175 (2001)","journal-title":"Int. J. Comput. Vis."},{"key":"525_CR42","unstructured":"Over, P., Awad, G., Michel, M., Fiscus, J., Antonishek, B., Smeaton, A.F., Kraaij, W., Qu\u00e9enot, G.: TRECVID 2011\u2014an overview of the goals, tasks, data, evaluation mechanisms and metrics. In: Proceedings of TRECVID 2011. NIST, USA (2011)"},{"key":"525_CR43","unstructured":"Over, P., Fiscus, J., Sanders, G., Shaw, B., Awad, G., Michel, M., Smeaton, A., Kraaij, W., Qu\u00e9enot, G.: TRECVID 2012-an overview of the goals, tasks, data, evaluation mechanisms and metrics. In: Proceedings of TRECVID 2012. NIST, USA (2012)"},{"key":"525_CR44","doi-asserted-by":"crossref","unstructured":"Putthividhya, D., Attias, H.T., Nagarajan, S.S.: Topic regression multi-model latent dirichlet allocation for image annotation. In: CVPR (2010)","DOI":"10.1109\/CVPR.2010.5540000"},{"key":"525_CR45","doi-asserted-by":"crossref","unstructured":"Reed, J., Lee, C.H.: On the importance of modeling temporal information in music tag annotation. In: ICASSP (2009)","DOI":"10.1109\/ICASSP.2009.4959973"},{"issue":"9","key":"525_CR46","doi-asserted-by":"crossref","first-page":"1582","DOI":"10.1109\/TPAMI.2009.154","volume":"32","author":"KEA Sande van de","year":"2010","unstructured":"van de Sande, K.E.A., Gevers, T., Snoek, C.G.M.: Evaluating color descriptors for object and scene recognition. PAMI 32(9), 1582\u20131596 (2010)","journal-title":"PAMI"},{"key":"525_CR47","doi-asserted-by":"crossref","unstructured":"Scheirer, W., Rocha, A., Micheals, R., Boult, T.: Robust fusion: extreme value theory for recognition score normalization. In: ECCV, pp. 481\u2013495 (2010)","DOI":"10.1007\/978-3-642-15558-1_35"},{"key":"525_CR48","doi-asserted-by":"crossref","unstructured":"Smith, J., Naphade, M., Natsev, A.: Multimedia semantic indexing using model vectors. In: ICME (2003)","DOI":"10.1109\/ICME.2003.1221649"},{"key":"525_CR49","doi-asserted-by":"crossref","unstructured":"Snoek, C.G.M., Worring, M., van Gemert, J.C., Geusebroek, J.M., Smeulders, A.W.: The challenge problem for automated detection of 101 semantic concepts in multimedia. In: Proceedings of ACM Multimedia (2006)","DOI":"10.1145\/1180639.1180727"},{"key":"525_CR50","doi-asserted-by":"crossref","unstructured":"Tamrakar, A., Ali, S., Yu, Q., Liu, J., Javed, O., Divakaran, A., Cheng, H., Sawhney, H.S.: Evaluation of low-level features and their combinations for complex event detection in open source videos. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6248114"},{"issue":"9","key":"525_CR51","doi-asserted-by":"crossref","first-page":"1630","DOI":"10.1109\/TPAMI.2008.224","volume":"31","author":"OR Terrades","year":"2009","unstructured":"Terrades, O.R., Valveny, E., Tabbone, S.: Optimal classifier fusion in a non-bayesian probabilistic framework. PAMI 31(9), 1630\u20131644 (2009)","journal-title":"PAMI"},{"key":"525_CR52","doi-asserted-by":"crossref","unstructured":"Tsao, Y., Sun, H., Li, H., Lee, C.H.: An acoustic segment model approach to incorporating temporal information into speaker modeling for text-independent speaker recognition. In: ICASSP (2010)","DOI":"10.1109\/ICASSP.2010.5495617"},{"key":"525_CR53","doi-asserted-by":"crossref","unstructured":"Vedaldi, A., Gulshan, V., Varma, M., Zisserman, A.: Multiple kernels for object detection. In: ICCV (2009)","DOI":"10.1109\/ICCV.2009.5459183"},{"key":"525_CR54","doi-asserted-by":"crossref","unstructured":"Vedaldi, A., Zisserman, A.: Efficient additive kernels via explicit feature maps (2011)","DOI":"10.1109\/CVPR.2010.5539949"},{"key":"525_CR55","unstructured":"Wang, C., Blei, D.M., Fei-Fei, L.: Simultaneous image classification and annotation. In: CVPR (2009)"},{"key":"525_CR56","doi-asserted-by":"crossref","unstructured":"Wang, Y., Mori, G.: Max-margin hidden conditional random fields for human action recognition. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206709"},{"key":"525_CR57","doi-asserted-by":"crossref","unstructured":"Xiao, J., Hays, J., Ehinger, K., Oliva, A., Torralba, A.: SUN database: large-scale scene recognition from abbey to zoo. In: CVPR (2010)","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"525_CR58","unstructured":"Yang, W., Wang, Y., Vahdat, A., Mori, G.: Kernel latent svm for visual recognition. In: Advances in Neural Information Processing Systems (NIPS) (2012)"},{"key":"525_CR59","unstructured":"Ye, G., Liu, D., Jhuo, I.H., Chang, S.F.: Robust late fusion with rank minimization. In: CVPR (2012)"},{"key":"525_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, D., Chen, X., Lee, W.S.: Text classification with kernels on the multinomial manifold. In: SIGIR (2005)","DOI":"10.1145\/1076034.1076081"}],"container-title":["Machine Vision and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-013-0525-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s00138-013-0525-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-013-0525-x","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T01:50:30Z","timestamp":1646099430000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s00138-013-0525-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2013,7,16]]},"references-count":60,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2014,1]]}},"alternative-id":["525"],"URL":"https:\/\/doi.org\/10.1007\/s00138-013-0525-x","relation":{},"ISSN":["0932-8092","1432-1769"],"issn-type":[{"type":"print","value":"0932-8092"},{"type":"electronic","value":"1432-1769"}],"subject":[],"published":{"date-parts":[[2013,7,16]]}}}