{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T11:01:27Z","timestamp":1775300487105,"version":"3.50.1"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2018,12,7]],"date-time":"2018-12-07T00:00:00Z","timestamp":1544140800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2019,6]]},"DOI":"10.1007\/s11042-018-6959-4","type":"journal-article","created":{"date-parts":[[2018,12,7]],"date-time":"2018-12-07T01:07:26Z","timestamp":1544144846000},"page":"15623-15646","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Correlation based feature fusion for the temporal video scene segmentation task"],"prefix":"10.1007","volume":"78","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5175-7551","authenticated-orcid":false,"given":"Rodrigo Mitsuo","family":"Kishi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tiago Henrique","family":"Trojahn","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rudinei","family":"Goularte","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2018,12,7]]},"reference":[{"issue":"6","key":"6959_CR1","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1007\/s00530-010-0182-0","volume":"16","author":"PK Atrey","year":"2010","unstructured":"Atrey PK, Hossain MA, El Saddik A, Kankanhalli MS (2010) Multimodal fusion for multimedia analysis: a survey. Multimedia Syst 16(6):345\u2013379. https:\/\/doi.org\/10.1007\/s00530-010-0182-0","journal-title":"Multimedia Syst"},{"key":"6959_CR2","doi-asserted-by":"publisher","unstructured":"Baraldi L, Grana C, Cucchiara R (2015) A deep siamese network for scene detection in broadcast videos. In: Proceedings of the 23rd ACM international conference on multimedia, MM \u201915, pp 1199\u20131202. ACM, New York. https:\/\/doi.org\/10.1145\/2733373.2806316","DOI":"10.1145\/2733373.2806316"},{"key":"6959_CR3","doi-asserted-by":"crossref","unstructured":"Baraldi L, Grana C, Cucchiara R (2015) Measuring scene detection performance, pp 395\u2013403, Springer International Publishing, Cham","DOI":"10.1007\/978-3-319-19390-8_45"},{"key":"6959_CR4","unstructured":"BBC: Planet earth. http:\/\/www.bbc.co.uk\/programmes\/b006mywy (2006). [Online; accessed 25-may-2018]"},{"key":"6959_CR5","doi-asserted-by":"crossref","unstructured":"Bromley J, Guyon I, LeCun Y, S\u00e4ckinger E, Shah R (1993) Signature verification using a \u201csiamese\u201d time delay neural network. In: Proceedings of the 6th international conference on neural information processing systems, NIPS\u201993, pp 737\u2013744. Morgan Kaufmann Publishers Inc., San Francisco. http:\/\/dl.acm.org\/citation.cfm?id=2987189.2987282","DOI":"10.1142\/S0218001493000339"},{"key":"6959_CR6","doi-asserted-by":"publisher","unstructured":"Chasanis V, Kalogeratos A, Likas A (2009) Movie segmentation into scenes and chapters using locally weighted bag of visual words. In: Proceedings of the ACM international conference on image and video retrieval, CIVR \u201909, pp 35:1\u201335:7. https:\/\/doi.org\/10.1145\/1646396.1646439 . ACM, New York","DOI":"10.1145\/1646396.1646439"},{"key":"6959_CR7","unstructured":"Csurka G, Dance CR, Fan L, Willamowski J, Bray C (2004) Visual categorization with bags of keypoints. In: Workshop on statistical learning in computer vision, ECCV, pp 1\u201322"},{"key":"6959_CR8","unstructured":"Davis SB, Mermelstein P (1980) Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences. IEEE transactions on acoustics, speech and signal processing, pp 357\u2013366"},{"issue":"5","key":"6959_CR9","doi-asserted-by":"publisher","first-page":"427","DOI":"10.1007\/s00530-013-0306-4","volume":"19","author":"M Del Fabro","year":"2013","unstructured":"Del Fabro M, B\u00f6sz\u00f6rmenyi L (2013) State-of-the-art and future challenges in video scene detection: a survey. Multimedia Syst 19(5):427\u2013454. https:\/\/doi.org\/10.1007\/s00530-013-0306-4","journal-title":"Multimedia Syst"},{"issue":"2","key":"6959_CR10","doi-asserted-by":"publisher","first-page":"325","DOI":"10.1007\/s11042-009-0325-5","volume":"47","author":"M Ellouze","year":"2010","unstructured":"Ellouze M, Boujemaa N, Alimi AM (2010) Scene pathfinder: unsupervised clustering techniques for movie scenes extraction. Multimedia Tools Appl 47(2):325\u2013346. https:\/\/doi.org\/10.1007\/s11042-009-0325-5","journal-title":"Multimedia Tools Appl"},{"key":"6959_CR11","unstructured":"Gao G, Ma H (2012) Multi-modality movie scene detection using kernel canonical correlation analysis. In: 2012 21st International Conference on Pattern recognition (ICPR), pp 3074\u20133077"},{"issue":"3","key":"6959_CR12","doi-asserted-by":"publisher","first-page":"381","DOI":"10.1016\/S0306-4573(98)00067-3","volume":"35","author":"JM Gauch","year":"1999","unstructured":"Gauch JM, Gauch S, Bouix S, Zhu X (1999) Real time video scene detection and classification. Inf Process Manag 35(3):381\u2013400","journal-title":"Inf Process Manag"},{"issue":"9","key":"6959_CR13","doi-asserted-by":"publisher","first-page":"1984","DOI":"10.1109\/TIFS.2016.2569061","volume":"11","author":"M Haghighat","year":"2016","unstructured":"Haghighat M, Abdel-Mottaleb M, Alhalabi W (2016) Discriminant correlation analysis: Real-time feature level fusion for multimodal biometric recognition. IEEE Trans Inf Forensic Secur 11(9):1984\u20131996. https:\/\/doi.org\/10.1109\/TIFS.2016.2569061","journal-title":"IEEE Trans Inf Forensic Secur"},{"key":"6959_CR14","doi-asserted-by":"publisher","unstructured":"Han B, Wu W (2011) Video scene segmentation using a novel boundary evaluation criterion and dynamic programming. In: 2011 IEEE International conference on multimedia and expo, pp 1\u20136. https:\/\/doi.org\/10.1109\/ICME.2011.6012001","DOI":"10.1109\/ICME.2011.6012001"},{"issue":"12","key":"6959_CR15","doi-asserted-by":"publisher","first-page":"2639","DOI":"10.1162\/0899766042321814","volume":"16","author":"DR Hardoon","year":"2004","unstructured":"Hardoon DR, Szedmak S, Shawe-Taylor J (2004) Canonical correlation analysis: an overview with application to learning methods. Neural Comput 16(12):2639\u20132664. https:\/\/doi.org\/10.1162\/0899766042321814","journal-title":"Neural Comput"},{"key":"6959_CR16","doi-asserted-by":"crossref","unstructured":"Hare J, Samangooei S, Dupplaw D (2011) Openimaj and imageterrier: Java libraries and tools for scalable multimedia analysis and indexing of images. In: ACM Multimedia 2011, pp 691\u2013694. ACM. Event Dates: 28\/11\/2011 until 1\/12\/2011. http:\/\/eprints.soton.ac.uk\/273040\/","DOI":"10.1145\/2072298.2072421"},{"issue":"1","key":"6959_CR17","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/s00138-013-0567-0","volume":"25","author":"IH Jhuo","year":"2014","unstructured":"Jhuo IH, Ye G, Gao S, Liu D, Jiang YG, Lee DT, Chang SF (2014) Discovering joint audio\u2013visual codewords for video event detection. Mach Vis Appl 25 (1):33\u201347","journal-title":"Mach Vis Appl"},{"key":"6959_CR18","doi-asserted-by":"crossref","unstructured":"Kender JR, Yeo BL (1998) Video scene segmentation via continuous video coherence. In: Proceedings of the IEEE computer society conference on computer vision and pattern recognition, CVPR \u201998, pp 367\u2013. IEEE Computer Society, Washington, DC, USA","DOI":"10.1109\/CVPR.1998.698632"},{"key":"6959_CR19","unstructured":"Koprinska I, Carrato S (2001) Temporal video segmentation: a survey. In: Signal processing: image communication, pp 477\u2013500"},{"key":"6959_CR20","doi-asserted-by":"publisher","unstructured":"Kurcius JJ, Breckon TP (2014) Using compressed audio-visual words for multi-modal scene classification. In: 2014 International workshop on computational intelligence for multimedia understanding (IWCIM), pp 1\u20135. https:\/\/doi.org\/10.1109\/IWCIM.2014.7008808","DOI":"10.1109\/IWCIM.2014.7008808"},{"key":"6959_CR21","unstructured":"LeCun Y, Bengio Y (1998) The handbook of brain theory and neural networks. MIT Press, Cambridge. http:\/\/dl.acm.org\/citation.cfm?id=303568.303704"},{"key":"6959_CR22","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1109\/TIT.1982.1056489","volume":"28","author":"SP Lloyd","year":"1982","unstructured":"Lloyd SP (1982) Least squares quantization in pcm. IEEE Trans Inf Theory 28:129\u2013137","journal-title":"IEEE Trans Inf Theory"},{"issue":"2","key":"6959_CR23","first-page":"194","volume":"5","author":"BL Lopes","year":"2014","unstructured":"Lopes BL, Trojahn TH, Goularte R (2014) Video scene detection by multimodal bag of features. J Inf Data Manag 5(2):194","journal-title":"J Inf Data Manag"},{"issue":"2","key":"6959_CR24","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"DG Lowe","year":"2004","unstructured":"Lowe DG (2004) Distinctive image features from scale-invariant keypoints. Int J Comput Vis 60(2):91\u2013110","journal-title":"Int J Comput Vis"},{"key":"6959_CR25","unstructured":"Mikolov T, Sutskever I, Chen K, Corrado G, Dean J (2013) Distributed representations of words and phrases and their compositionality. In: Proceedings of the 26th international conference on neural information processing systems - Volume 2, NIPS\u201913, pp 3111\u20133119. Curran Associates Inc., USA. http:\/\/dl.acm.org\/citation.cfm?id=2999792.2999959"},{"key":"6959_CR26","unstructured":"Rabiner L, Juang BH (1993) Fundamentals of speech recognition. Prentice-hall, inc., upper saddle river, NJ USA"},{"key":"6959_CR27","volume-title":"Emotion recognition using speech features","author":"KS Rao","year":"2012","unstructured":"Rao KS, Koolagudi SG (2012) Emotion recognition using speech features. Springer Publishing Company, Incorporated, New York"},{"key":"6959_CR28","doi-asserted-by":"publisher","unstructured":"Rasheed Z, Shah M (2003) Scene detection in hollywood movies and tv shows. In: Proceedings of the 2003 IEEE computer society conference on computer vision and pattern recognition, 2003. vol 2, pp II\u2013343\u20138 vol 2. https:\/\/doi.org\/10.1109\/CVPR.2003.1211489","DOI":"10.1109\/CVPR.2003.1211489"},{"key":"6959_CR29","unstructured":"Rasiwasia N, Mahajan D, Mahadevan V, Aggarwal G (2014) Cluster canonical correlation analysis. In: Kaski S, Corander J (eds) Proceedings of the seventeenth international conference on artificial intelligence and statistics, Proceedings of machine learning research, vol 33, pp 823-831. PMLR, Reykjavik, Iceland"},{"key":"6959_CR30","doi-asserted-by":"publisher","unstructured":"Saraceno C, Leonardi R (1997) Audio as a support to scene change detection and characterization of video sequences. In: 1997 IEEE international conference on acoustics, speech, and signal processing, 1997. ICASSP-97. vol 4, pp 2597\u20132600 vol 4. https:\/\/doi.org\/10.1109\/ICASSP.1997.595320","DOI":"10.1109\/ICASSP.1997.595320"},{"issue":"8","key":"6959_CR31","doi-asserted-by":"publisher","first-page":"1163","DOI":"10.1109\/TCSVT.2011.2138830","volume":"21","author":"P Sidiropoulos","year":"2011","unstructured":"Sidiropoulos P, Mezaris V, Kompatsiaris I, Meinedo H, Bugalho M, Trancoso I (2011) Temporal video segmentation to scenes using high-level audiovisual features. IEEE Trans Cir Sys Video Technol 21(8):1163\u20131177. https:\/\/doi.org\/10.1109\/TCSVT.2011.2138830","journal-title":"IEEE Trans Cir Sys Video Technol"},{"issue":"12","key":"6959_CR32","doi-asserted-by":"publisher","first-page":"1349","DOI":"10.1109\/34.895972","volume":"22","author":"AWM Smeulders","year":"2000","unstructured":"Smeulders AWM, Worring M, Santini S, Gupta A, Jain R (2000) Content-based image retrieval at the end of the early years. IEEE Trans Pattern Anal Mach Intell 22(12):1349\u20131380. https:\/\/doi.org\/10.1109\/34.895972","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"6959_CR33","doi-asserted-by":"publisher","unstructured":"Snoek CGM, Worring M (2002) A review on multimodal video indexing. In: Proceedings of the 2002 IEEE International Conference on Multimedia and expo, 2002. ICME \u201902. vol 2, pp 21\u201324 vol 2. https:\/\/doi.org\/10.1109\/ICME.2002.1035364","DOI":"10.1109\/ICME.2002.1035364"},{"issue":"1","key":"6959_CR34","doi-asserted-by":"publisher","first-page":"11","DOI":"10.1007\/BF00130487","volume":"7","author":"MJ Swain","year":"1991","unstructured":"Swain MJ, Ballard DH (1991) Color indexing. Int J Comput Vis 7(1):11\u201332. https:\/\/doi.org\/10.1007\/BF00130487","journal-title":"Int J Comput Vis"},{"issue":"4","key":"6959_CR35","doi-asserted-by":"publisher","first-page":"492","DOI":"10.1109\/TMM.2002.802021","volume":"4","author":"J Vendrig","year":"2002","unstructured":"Vendrig J, Worring M (2002) Systematic evaluation of logical story unit segmentation. IEEE Trans Multimedia 4(4):492\u2013499. https:\/\/doi.org\/10.1109\/TMM.2002.802021","journal-title":"IEEE Trans Multimedia"},{"issue":"4","key":"6959_CR36","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1109\/LSP.2016.2611485","volume":"24","author":"X Wang","year":"2017","unstructured":"Wang X, Gao L, Song J, Shen H (2017) Beyond frame-level cnn: saliency-aware 3-d cnn with lstm for video action recognition. IEEE Sig Process Lett 24(4):510\u2013514. https:\/\/doi.org\/10.1109\/LSP.2016.2611485","journal-title":"IEEE Sig Process Lett"},{"key":"6959_CR37","doi-asserted-by":"publisher","first-page":"438","DOI":"10.1016\/j.neucom.2017.08.063","volume":"275","author":"X Wang","year":"2018","unstructured":"Wang X, Gao L, Song J, Zhen X, Sebe N, Shen HT (2018) Deep appearance and motion learning for egocentric activity recognition. Neurocomputing 275:438\u2013447. https:\/\/doi.org\/10.1016\/j.neucom.2017.08.063 . http:\/\/www.sciencedirect.com\/science\/article\/pii\/S0925231217314935","journal-title":"Neurocomputing"},{"issue":"3","key":"6959_CR38","doi-asserted-by":"publisher","first-page":"634","DOI":"10.1109\/TMM.2017.2749159","volume":"20","author":"X Wang","year":"2018","unstructured":"Wang X, Gao L, Wang P, Sun X, Liu X (2018) Two-stream 3-d convnet fusion for action recognition in videos with arbitrary size and length. IEEE Trans Multimedia 20(3):634\u2013644. https:\/\/doi.org\/10.1109\/TMM.2017.2749159","journal-title":"IEEE Trans Multimedia"},{"key":"6959_CR39","doi-asserted-by":"publisher","unstructured":"Wu S, Jin M (2015) Study on a new video scene segmentation algorithm. Appl Math Inf Sci 9 (1):361\u2013368. https:\/\/doi.org\/10.12785\/amis\/090142 . https:\/\/www.scopus.com\/inward\/record.uri?eid=2-s2.0-84907246427&partnerID=40&md5=dd07505c1071cd1603e5206c25e41311 . Cited By 0","DOI":"10.12785\/amis\/090142"},{"key":"6959_CR40","doi-asserted-by":"publisher","unstructured":"Xi W, Fox EA, Fan W, Zhang B, Chen Z, Yan J, Zhuang D (2005) Simfusion: Measuring similarity using unified relationship matrix. In: Proceedings of the 28th annual international ACM SIGIR conference on research and development in information retrieval, SIGIR \u201905, pp 130\u2013137. ACM, New York. https:\/\/doi.org\/10.1145\/1076034.1076059","DOI":"10.1145\/1076034.1076059"},{"key":"6959_CR41","doi-asserted-by":"crossref","unstructured":"Xie L, Shen J, Han J, Zhu L, Shao L (2017) Dynamic multi-view hashing for online image retrieval. In: Proceedings of the 26th international joint conference on artificial intelligence, IJCAI\u201917, pp 3133\u20133139. AAAI Press. http:\/\/dl.acm.org\/citation.cfm?id=3172077.3172326","DOI":"10.24963\/ijcai.2017\/437"},{"key":"6959_CR42","doi-asserted-by":"crossref","unstructured":"Xie L, Shen J, Zhu L (2016) Online cross-modal hashing for web image retrieval. In: Proceedings of the thirtieth AAAI conference on artificial intelligence, AAAI\u201916, pp 294\u2013300. AAAI Press. http:\/\/dl.acm.org\/citation.cfm?id=3015812.3015855","DOI":"10.1609\/aaai.v30i1.9982"},{"key":"6959_CR43","doi-asserted-by":"publisher","unstructured":"Xu S, Feng B, Ding P, Xu B (2012) Graph-based multi-modal scene detection for movie and teleplay. In: 2012 IEEE International Conference On Acoustics, Speech and Signal Processing (ICASSP), pp 1413\u20131416. https:\/\/doi.org\/10.1109\/ICASSP.2012.6288155","DOI":"10.1109\/ICASSP.2012.6288155"},{"key":"6959_CR44","doi-asserted-by":"publisher","unstructured":"Xu S, Feng B, Xu B (2013) Temporal video segmentation to scene based on conditional random fileds. In: Li S, El Saddik A, Wang M, Mei T, Sebe N, Yan S, Hong R, Gurrin C (eds) 2013 Proceedings of the 19th international conference on advances in multimedia modeling, MMM 2013, Huangshan, China, January 7-9, Part II, pp 374\u2013384. Springer, Berlin. https:\/\/doi.org\/10.1007\/978-3-642-35728-2_36","DOI":"10.1007\/978-3-642-35728-2_36"},{"key":"6959_CR45","doi-asserted-by":"publisher","unstructured":"Yeung M, Yeo BL, Liu B (1998) Segmentation of video by clustering and graph analysis. Comput. Vis. Image Underst 71(1):94\u2013109. https:\/\/doi.org\/10.1006\/cviu.1997.0628","DOI":"10.1006\/cviu.1997.0628"},{"key":"6959_CR46","unstructured":"Yu SX, Shi J (2001) Grouping with bias. In: Proceedings of the 14th international conference on neural information processing systems: natural and synthetic, NIPS\u201901, pp 1327\u20131334. http:\/\/dl.acm.org\/citation.cfm?id=2980539.2980711 . MIT Press, Cambridge"},{"issue":"11","key":"6959_CR47","doi-asserted-by":"publisher","first-page":"3941","DOI":"10.1109\/TCYB.2016.2591068","volume":"47","author":"L Zhu","year":"2017","unstructured":"Zhu L, Shen J, Xie L, Cheng Z (2017) Unsupervised topic hypergraph hashing for efficient mobile image retrieval. IEEE Trans Cybern 47(11):3941\u20133954. https:\/\/doi.org\/10.1109\/TCYB.2016.2591068","journal-title":"IEEE Trans Cybern"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-018-6959-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11042-018-6959-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-018-6959-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T10:04:17Z","timestamp":1775297057000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11042-018-6959-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,12,7]]},"references-count":47,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2019,6]]}},"alternative-id":["6959"],"URL":"https:\/\/doi.org\/10.1007\/s11042-018-6959-4","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,12,7]]},"assertion":[{"value":"17 December 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 November 2018","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 November 2018","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 December 2018","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}