{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,4]],"date-time":"2025-11-04T10:54:58Z","timestamp":1762253698958,"version":"3.37.3"},"reference-count":69,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2021,2,8]],"date-time":"2021-02-08T00:00:00Z","timestamp":1612742400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,2,8]],"date-time":"2021-02-08T00:00:00Z","timestamp":1612742400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2021,5]]},"DOI":"10.1007\/s11042-020-10450-2","type":"journal-article","created":{"date-parts":[[2021,2,9]],"date-time":"2021-02-09T08:08:10Z","timestamp":1612858090000},"page":"17487-17513","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":15,"title":["Temporal video scene segmentation using deep-learning"],"prefix":"10.1007","volume":"80","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1826-4456","authenticated-orcid":false,"given":"Tiago Henrique","family":"Trojahn","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rudinei","family":"Goularte","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,2,8]]},"reference":[{"doi-asserted-by":"publisher","unstructured":"Arthur D, Arthur D, Vassilvitskii S, Vassilvitskii S (2007) k-means++: the advantages of careful seeding. In: Proceedings of the 18th annual ACM-SIAM Symposium on Discrete Algorithms. https:\/\/doi.org\/10.1145\/1283383.1283494. https:\/\/dl.acm.org\/citation.cfm?id=1283494, vol 8, pp 1027\u20131035","key":"10450_CR1","DOI":"10.1145\/1283383.1283494"},{"issue":"6","key":"10450_CR2","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1007\/s00530-010-0182-0","volume":"16","author":"PK Atrey","year":"2010","unstructured":"Atrey P K, Hossain M A, El Saddik A, Kankanhalli M S (2010) Multimodal fusion for multimedia analysis: a survey. Multimed Syst 16(6):345\u2013379. https:\/\/doi.org\/10.1007\/s00530-010-0182-0","journal-title":"Multimed Syst"},{"issue":"1","key":"10450_CR3","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/s11042-009-0351-3","volume":"48","author":"L Ballan","year":"2010","unstructured":"Ballan L, Bertini M, Del Bimbo A, Serra G (2010) Video event classification using string kernels. Multimed Tools Appl 48(1):69\u201387. https:\/\/doi.org\/10.1007\/s11042-009-0351-3. http:\/\/link.springer.com\/10.1007\/s11042-009-0351-3","journal-title":"Multimed Tools Appl"},{"doi-asserted-by":"publisher","unstructured":"Baraldi L, Grana C, Cucchiara R (2015a) A deep siamese network for scene detection in broadcast videos. In: Proceedings of the 23rd ACM international conference on multimedia, MM \u201915. https:\/\/doi.org\/10.1145\/2733373.2806316. https:\/\/dl.acm.org\/citation.cfm?doid=2733373.2806316. ACM, New York, pp 1199\u20131202","key":"10450_CR4","DOI":"10.1145\/2733373.2806316"},{"doi-asserted-by":"publisher","unstructured":"Baraldi L, Grana C, Cucchiara R (2015b) Measuring scene detection performance. In: Paredes R, Cardoso JS, Pardo XM (eds) Pattern recognition and image analysis, lecture notes in computer science. https:\/\/doi.org\/10.1007\/978-3-319-19390-8_45. http:\/\/link.springer.com\/10.1007\/978-3-319-19390-8_45, vol 9117. Springer International Publishing, Cham, pp 395\u2013403","key":"10450_CR5","DOI":"10.1007\/978-3-319-19390-8_45"},{"key":"10450_CR6","doi-asserted-by":"publisher","first-page":"801","DOI":"10.1007\/978-3-319-23192-1_67","volume-title":"Shot and scene detection via hierarchical clustering for re-using broadcast video","author":"L Baraldi","year":"2015","unstructured":"Baraldi L, Grana C, Cucchiara R (2015c) Shot and scene detection via hierarchical clustering for re-using broadcast video. Springer International Publishing, Cham, pp 801\u2013811. https:\/\/doi.org\/10.1007\/978-3-319-23192-1_67. https:\/\/link.springer.com\/chapter\/10.1007\/978-3-319-23192-1_67"},{"doi-asserted-by":"publisher","unstructured":"Baraldi L, Grana C, Cucchiara R (2017) Hierarchical boundary-aware neural encoder for video captioning. In: 2017 IEEE conference on computer vision and pattern recognition (CVPR). https:\/\/doi.org\/10.1109\/CVPR.2017.339, pp 3185\u20133194","key":"10450_CR7","DOI":"10.1109\/CVPR.2017.339"},{"issue":"5","key":"10450_CR8","doi-asserted-by":"publisher","first-page":"955","DOI":"10.1109\/TMM.2016.2644872","volume":"19","author":"L Baraldi","year":"2017","unstructured":"Baraldi L, Grana C, Cucchiara R (2017) Recognizing and presenting the storytelling video structure with deep multimodal networks. IEEE Trans Multimed 19(5):955\u2013968. https:\/\/doi.org\/10.1109\/TMM.2016.2644872. http:\/\/ieeexplore.ieee.org\/document\/7797131\/","journal-title":"IEEE Trans Multimed"},{"doi-asserted-by":"publisher","unstructured":"Barbieri TTS, Trojahn TH, Ponti MP Jr, Goularte R (2015) Shot-hr: a video shot representation method based on visual features. In: Proceedings of the 30th annual ACM symposium on applied computing, SAC \u201915. https:\/\/doi.org\/10.1145\/2695664.2695841. http:\/\/doi.acm.org\/10.1145\/2695664.2695841. ACM, New York, pp 1257\u20131262","key":"10450_CR9","DOI":"10.1145\/2695664.2695841"},{"key":"10450_CR10","doi-asserted-by":"publisher","first-page":"437","DOI":"10.1007\/978-3-642-35289-8_26","volume-title":"Practical recommendations for gradient-based training of deep architectures, 2nd edn","author":"Y Bengio","year":"2012","unstructured":"Bengio Y (2012) Practical recommendations for gradient-based training of deep architectures, 2nd edn. Springer, Berlin, pp 437\u2013478. https:\/\/doi.org\/10.1007\/978-3-642-35289-8_26"},{"doi-asserted-by":"crossref","unstructured":"Blanken HM, de Vries AP, Blok HE, Feng L (eds) (2007) Multimedia retrieval, 1st edn. Springer, Berlin. https:\/\/www.springer.com\/la\/book\/9783540728948","key":"10450_CR11","DOI":"10.1007\/978-3-540-72895-5_1"},{"issue":"2","key":"10450_CR12","doi-asserted-by":"publisher","first-page":"233","DOI":"10.1147\/rd.422.0233","volume":"42","author":"RM Bolle","year":"1998","unstructured":"Bolle R M, Yeo B L, Yeung M M (1998) Video query: research directions. IBM J Res Dev 42(2):233\u2013252. https:\/\/doi.org\/10.1147\/rd.422.0233. https:\/\/ieeexplore.ieee.org\/document\/5389317\/","journal-title":"IBM J Res Dev"},{"issue":"1","key":"10450_CR13","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1016\/j.cviu.2008.07.003","volume":"113","author":"GJ Burghouts","year":"2009","unstructured":"Burghouts G J, Geusebroek J M (2009) Performance evaluation of local colour invariants. Comput Vis Image Underst 113(1):48\u201362. https:\/\/doi.org\/10.1016\/j.cviu.2008.07.003","journal-title":"Comput Vis Image Underst"},{"key":"10450_CR14","doi-asserted-by":"publisher","first-page":"190","DOI":"10.1016\/j.eswa.2019.04.031","volume":"131","author":"L Caruccio","year":"2019","unstructured":"Caruccio L, Polese G, Tortora G, Iannone D (2019) Edcar: a knowledge representation framework to enhance automatic video surveillance. Expert Syst Appl 131:190\u2013207. https:\/\/doi.org\/10.1016\/j.eswa.2019.04.031. http:\/\/www.sciencedirect.com\/science\/article\/pii\/S0957417419302623","journal-title":"Expert Syst Appl"},{"doi-asserted-by":"crossref","unstructured":"Chasanis V, Kalogeratos A, Likas A (2009a) Movie segmentation into scenes and chapters using locally weighted bag of visual words. In: Proceedings of the ACM international conference on image and video retrieval, CIVR \u201909. http:\/\/doi.acm.org\/10.1145\/1646396.1646439. ACM, New York, pp 35:1\u201335:7","key":"10450_CR15","DOI":"10.1145\/1646396.1646439"},{"issue":"1","key":"10450_CR16","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1109\/TMM.2008.2008924","volume":"11","author":"V Chasanis","year":"2009","unstructured":"Chasanis V, Likas A, Galatsanos N (2009b) Scene detection in videos using shot clustering and sequence alignment. IEEE Trans Multimed 11(1):89\u2013100. https:\/\/doi.org\/10.1109\/TMM.2008.2008924. http:\/\/ieeexplore.ieee.org\/document\/4721597\/","journal-title":"IEEE Trans Multimed"},{"doi-asserted-by":"publisher","unstructured":"Chen L, Ozsu M (2002) Rule-based scene extraction from video. In: Proceedings of the international conference on image processing. https:\/\/doi.org\/10.1109\/ICIP.2002.1040056. http:\/\/ieeexplore.ieee.org\/document\/1040056\/, vol 2. IEEE, Washington, DC, pp 737\u2013740","key":"10450_CR17","DOI":"10.1109\/ICIP.2002.1040056"},{"doi-asserted-by":"crossref","unstructured":"Cho K, van Merri\u00ebnboer B, G\u00fcl\u00e7ehre \u00c7, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation. In: Proceedings of the conference on empirical methods in natural language processing. http:\/\/www.aclweb.org\/anthology\/D14-1179. Association for Computational Linguistics, pp 1724\u20131734","key":"10450_CR18","DOI":"10.3115\/v1\/D14-1179"},{"doi-asserted-by":"publisher","unstructured":"Coimbra DB, Goularte R (2009) Digital video scenes identification using audiovisual features. In: Proceedings of the XV Brazilian symposium on multimedia and the web\u2014WebMedia \u201909. https:\/\/doi.org\/10.1145\/1858477.1858520. http:\/\/doi.acm.org\/10.1145\/1858477.1858520. ACM Press, New York, pp 1\u20134","key":"10450_CR19","DOI":"10.1145\/1858477.1858520"},{"doi-asserted-by":"publisher","unstructured":"de Souza TT, Goularte R (2013) Video shot representation based on histograms. In: Proceedings of the 28th annual ACM symposium on applied computing, SAC \u201913. https:\/\/doi.org\/10.1145\/2480362.2480547. http:\/\/doi.acm.org\/10.1145\/2480362.2480547. ACM, New York, pp 961\u2013966","key":"10450_CR20","DOI":"10.1145\/2480362.2480547"},{"issue":"5","key":"10450_CR21","doi-asserted-by":"publisher","first-page":"427","DOI":"10.1007\/s00530-013-0306-4","volume":"19","author":"M Del Fabro","year":"2013","unstructured":"Del Fabro M, B\u00f6sz\u00f6rmenyi L (2013) State-of-the-art and future challenges in video scene detection: a survey. Multimed Syst 19(5):427\u2013454. https:\/\/doi.org\/10.1007\/s00530-013-0306-4. https:\/\/ieeexplore.ieee.org\/document\/5972529\/","journal-title":"Multimed Syst"},{"unstructured":"Goodfellow I, Bengio Y, Courville A (2016) Deep learning. MIT Press. http:\/\/www.deeplearningbook.org","key":"10450_CR22"},{"doi-asserted-by":"publisher","unstructured":"Gupta A, Gupta H (2013) Applications of mfcc and vector quantization in speaker recognition. In: International conference on intelligent systems and signal processing, ISSP\u2019. https:\/\/doi.org\/10.1109\/ISSP.2013.6526896. https:\/\/ieeexplore.ieee.org\/document\/6526896\/, vol 13, pp 170\u2013173","key":"10450_CR23","DOI":"10.1109\/ISSP.2013.6526896"},{"doi-asserted-by":"publisher","unstructured":"Gygli M (2018) Ridiculously fast shot boundary detection with fully convolutional neural networks. In: 2018 International conference on content-based multimedia indexing (CBMI). https:\/\/doi.org\/10.1109\/CBMI.2018.8516556, pp 1\u20134","key":"10450_CR24","DOI":"10.1109\/CBMI.2018.8516556"},{"doi-asserted-by":"publisher","unstructured":"Han B, Wu W (2011) Video scene segmentation using a novel boundary evaluation criterion and dynamic programming. In: IEEE international conference on multimedia and expo. https:\/\/doi.org\/10.1109\/ICME.2011.6012001. https:\/\/ieeexplore.ieee.org\/document\/6012001\/, pp 1\u20136","key":"10450_CR25","DOI":"10.1109\/ICME.2011.6012001"},{"issue":"4","key":"10450_CR26","doi-asserted-by":"publisher","first-page":"580","DOI":"10.1109\/76.767124","volume":"9","author":"A Hanjalic","year":"1999","unstructured":"Hanjalic A, Lagendijk R L, Biemond J (1999) Automated high-level movie segmentation for advanced video-retrieval systems. IEEE Trans Circ Syst Video Technol 9(4):580\u2013588. https:\/\/doi.org\/10.1109\/76.767124. https:\/\/ieeexplore.ieee.org\/document\/767124\/","journal-title":"IEEE Trans Circ Syst Video Technol"},{"doi-asserted-by":"crossref","unstructured":"Hare JS, Samangooei S, Dupplaw DP (2011) OpenIMAJ and ImageTerrier: Java libraries and tools for scalable multimedia analysis and indexing of images. In: Proceedings of the 19th ACM international conference on multimedia, MM \u201911. doi:10.1145\/2072298.2072421. http:\/\/doi.acm.org\/10.1145\/2072298.2072421. ACM, New York, pp 691\u2013694","key":"10450_CR27","DOI":"10.1145\/2072298.2072421"},{"unstructured":"Hassanien A, Elgharib MA, Selim A, Hefeeda M, Matusik W (2017) Large-scale, fast and accurate shot boundary detection through spatio-temporal convolutional neural networks. CoRR arXiv:1705.03281","key":"10450_CR28"},{"doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: IEEE conference on computer vision and pattern recognition. https:\/\/doi.org\/10.1109\/CVPR.2016.90. https:\/\/ieeexplore.ieee.org\/document\/7780459\/. IEEE, pp 770\u2013778","key":"10450_CR29","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"10450_CR30","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780. https:\/\/doi.org\/10.1162\/neco.1997.9.8.1735","journal-title":"Neural Comput"},{"doi-asserted-by":"publisher","unstructured":"Jacobs CE, Finkelstein A, Salesin DH (1995) Fast multiresolution image querying. In: Proceedings of the 22nd annual conference on computer graphics and interactive techniques, SIGGRAPH \u201995. https:\/\/doi.org\/10.1145\/218380.218454. http:\/\/doi.acm.org\/10.1145\/218380.218454. ACM, New York, pp 277\u2013286","key":"10450_CR31","DOI":"10.1145\/218380.218454"},{"unstructured":"Jones E, Oliphant T, Peterson P et al (2001) SciPy: open source scientific tools for Python. http:\/\/www.scipy.org\/","key":"10450_CR32"},{"unstructured":"Jozefowicz R, Zaremba W, Sutskever I (2015) An empirical exploration of recurrent network architectures. In: Proceedings of the 32nd international conference on international conference on machine learning, JMLR.org. https:\/\/dl.acm.org\/citation.cfm?id=3045367, pp 2342\u20132350","key":"10450_CR33"},{"issue":"11","key":"10450_CR34","doi-asserted-by":"publisher","first-page":"15623","DOI":"10.1007\/s11042-018-6959-4","volume":"78","author":"RM Kishi","year":"2019","unstructured":"Kishi R M, Trojahn T H, Goularte R (2019) Correlation based feature fusion for the temporal video scene segmentation task. Multimed Tools Appl 78 (11):15623\u201315646. https:\/\/doi.org\/10.1007\/s11042-018-6959-4","journal-title":"Multimed Tools Appl"},{"unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2012) ImageNet classification with deep convolutional neural networks. In: Proceedings of the 25th international conference on neural information processing system, NIPS\u201912. http:\/\/dl.acm.org\/citation.cfm?id=2999134.2999257, vol 1. Curran Associates Inc., pp 1097\u20131105","key":"10450_CR35"},{"doi-asserted-by":"crossref","unstructured":"Kumar N, Sukavanam N (2019) Keyframes and shot boundaries: the attributes of scene segmentation and classification. In: Yadav N, Yadav A, Bansal J C, Deep K, Kim J H (eds) Harmony search and nature inspired optimization algorithms. Springer, Singapore, pp 771\u2013782","key":"10450_CR36","DOI":"10.1007\/978-981-13-0761-4_74"},{"unstructured":"Lipton Z C, Berkowitz J, Elkan C (2015) A critical review of recurrent neural networks for sequence learning. Computing Research Repository arXiv 2015. arXiv:1506.00019","key":"10450_CR37"},{"doi-asserted-by":"publisher","unstructured":"Liu Z, Wang Y (2018) Tv news story segmentation using deep neural network. In: IEEE international conference on multimedia expo workshops (ICMEW). https:\/\/doi.org\/10.1109\/ICMEW.2018.8551568, pp 1\u20134","key":"10450_CR38","DOI":"10.1109\/ICMEW.2018.8551568"},{"issue":"2","key":"10450_CR39","first-page":"194","volume":"5","author":"BL Lopes","year":"2014","unstructured":"Lopes BL, Trojahn TH, Goularte R (2014) Video scene detection by multimodal bag of features. J Inf Data Manag 5(2):194\u2013205. https:\/\/seer.ufmg.br\/index.php\/jidm\/article\/view\/632","journal-title":"J Inf Data Manag"},{"issue":"2","key":"10450_CR40","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"DG Lowe","year":"2004","unstructured":"Lowe DG (2004) Distinctive image features from scale-invariant keypoints. Int J Comput Vis 60 (2):91\u2013110. https:\/\/doi.org\/10.1023\/B:VISI.0000029664.99615.94. http:\/\/dl.acm.org\/citation.cfm?id=993451.996342","journal-title":"Int J Comput Vis"},{"issue":"10","key":"10450_CR41","doi-asserted-by":"publisher","first-page":"1615","DOI":"10.1109\/TPAMI.2005.188","volume":"27","author":"K Mikolajczyk","year":"2005","unstructured":"Mikolajczyk K, Schmid C (2005) Performance evaluation of local descriptors. IEEE Trans Pattern Anal Mach Intell 27(10):1615\u20131630. https:\/\/doi.org\/10.1109\/TPAMI.2005.188. http:\/\/dl.acm.org\/citation.cfm?id=1083822.1083989","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"unstructured":"Mikolov T, Chen K, Corrado G, Dean J (2013) Efficient estimation of word representations in vector space. arXiv:1301.3781","key":"10450_CR42"},{"doi-asserted-by":"publisher","unstructured":"Muhammad H, Junaid B, Ihsan U, Sher MD, Maheen B, Varsha D (2018) Video scene detection using compact bag of visual word models. Advances in Multimedia 2018. https:\/\/doi.org\/10.1155\/2018\/2564963","key":"10450_CR43","DOI":"10.1155\/2018\/2564963"},{"issue":"3","key":"10450_CR44","doi-asserted-by":"publisher","first-page":"443","DOI":"10.1016\/0022-2836(70)90057-4","volume":"48","author":"SB Needleman","year":"1970","unstructured":"Needleman S B, Wunsch C D (1970) A general method applicable to the search for similarities in the amino acid sequence of two proteins. J Molec Biol 48 (3):443\u2013453. https:\/\/doi.org\/10.1016\/0022-2836(70)90057-4. https:\/\/www.sciencedirect.com\/science\/article\/pii\/0022283670900574","journal-title":"J Molec Biol"},{"unstructured":"Ng AY, Jordan MI, Weiss Y (2001) On spectral clustering: analysis and an algorithm. In: Proceedings of the 14th international conference on neural information processing systems: natural and synthetic, NIPS \u201901. http:\/\/dl.acm.org\/citation.cfm?id=2980539.2980649. MIT Press, Cambridge, pp 849\u2013856","key":"10450_CR45"},{"issue":"5","key":"10450_CR46","doi-asserted-by":"publisher","first-page":"991","DOI":"10.1007\/s11760-018-1244-6","volume":"12","author":"S Protasov","year":"2018","unstructured":"Protasov S, Khan A M, Sozykin K, Ahmad M (2018) Using deep features for video scene detection and annotation. Signal Image Video Process 12 (5):991\u2013999. https:\/\/doi.org\/10.1007\/s11760-018-1244-6","journal-title":"Signal Image Video Process"},{"doi-asserted-by":"crossref","unstructured":"Qin Z, Liu W, Wan T (2013) A bag-of-tones model with mfcc features for musical genre classification. In: Motoda H, Wu Z, Cao L, Zaiane O, Yao M, Wang W (eds) Advanced data mining and applications. https:\/\/link.springer.com\/chapter\/10.1007\/978-3-642-53914-5_48. Springer, Berlin, pp 564\u2013575","key":"10450_CR47","DOI":"10.1007\/978-3-642-53914-5_48"},{"doi-asserted-by":"publisher","unstructured":"Rasheed Z, Shah M (2003) Scene detection in Hollywood movies and TV shows. In: IEEE conference on computer vision and pattern recognition. https:\/\/doi.org\/10.1109\/CVPR.2003.1211489. http:\/\/ieeexplore.ieee.org\/document\/1211489, vol 2. IEEE Computer Society, Vancouver, pp 343\u2013348","key":"10450_CR48","DOI":"10.1109\/CVPR.2003.1211489"},{"issue":"6","key":"10450_CR49","doi-asserted-by":"publisher","first-page":"1097","DOI":"10.1109\/TMM.2005.858392","volume":"7","author":"Z Rasheed","year":"2005","unstructured":"Rasheed Z, Shah M (2005) Detection and representation of scenes in videos. IEEE Trans Multimed 7(6):1097\u20131105. https:\/\/ieeexplore.ieee.org\/document\/1542086\/","journal-title":"IEEE Trans Multimed"},{"doi-asserted-by":"publisher","unstructured":"Razavian AS, Azizpour H, Sullivan J, Carlsson S (2014) CNN features off-the-shelf: an astounding baseline for recognition. In: IEEE conference on computer vision and pattern recognition workshops. https:\/\/doi.org\/10.1109\/CVPRW.2014.131. http:\/\/ieeexplore.ieee.org\/document\/6910029\/, 1403.6382. IEEE, pp 512\u2013519","key":"10450_CR50","DOI":"10.1109\/CVPRW.2014.131"},{"key":"10450_CR51","volume-title":"Information retrieval, 2nd edn","author":"CG Rijsbergen","year":"1979","unstructured":"Rijsbergen C G (1979) Information retrieval, 2nd edn. Butterworths, London"},{"doi-asserted-by":"crossref","unstructured":"Rotman D, Porat D, Ashour G (2016) Robust and efficient video scene detection using optimal sequential grouping. In: 2016 IEEE international symposium on multimedia (ISM), pp 275\u2013280","key":"10450_CR52","DOI":"10.1109\/ISM.2016.0061"},{"doi-asserted-by":"crossref","unstructured":"Rotman D, Porat D, Ashour G (2017) Robust video scene detection using multimodal fusion of optimally grouped features. In: 2017 IEEE 19th international workshop on multimedia signal processing (MMSP), pp 1\u20136","key":"10450_CR53","DOI":"10.1109\/MMSP.2017.8122267"},{"doi-asserted-by":"publisher","unstructured":"Rotman D, Porat D, Ashour G, Barzelay U (2018) Optimally grouped deep features using normalized cost for video scene detection. In: Proceedings of the 2018 ACM on international conference on multimedia retrieval, ICMR \u201918. https:\/\/doi.org\/10.1145\/3206025.3206055. http:\/\/doi.acm.org\/10.1145\/3206025.3206055. ACM, New York, pp 187\u2013195","key":"10450_CR54","DOI":"10.1145\/3206025.3206055"},{"doi-asserted-by":"publisher","unstructured":"Schroff F, Kalenichenko D, Philbin J (2015) Facenet: a unified embedding for face recognition and clustering. In: IEEE conference on computer vision and pattern recognition. https:\/\/doi.org\/10.1109\/CVPR.2015.7298682. https:\/\/ieeexplore.ieee.org\/document\/7298682\/, vol 2015, pp 815\u2013823","key":"10450_CR55","DOI":"10.1109\/CVPR.2015.7298682"},{"doi-asserted-by":"publisher","unstructured":"Sidiropoulos P, Mezaris V, Kompatsiaris I, Meinedo H, Trancoso I (2009) Multi-modal scene segmentation using scene transition graphs. In: Proceedings of the seventeen ACM international conference on multimedia. https:\/\/doi.org\/10.1145\/1631272.1631383. http:\/\/dl.acm.org\/citation.cfm?id=1631272.1631383. ACM Press, New York, pp 665\u2013668","key":"10450_CR56","DOI":"10.1145\/1631272.1631383"},{"issue":"8","key":"10450_CR57","doi-asserted-by":"publisher","first-page":"1163","DOI":"10.1109\/TCSVT.2011.2138830","volume":"21","author":"P Sidiropoulos","year":"2011","unstructured":"Sidiropoulos P, Mezaris V, Kompatsiaris I, Meinedo H, Bugalho M, Trancoso I (2011) Temporal video segmentation to scenes using high-level audiovisual features. IEEE Trans Circ Syst Video Technol 21(8):1163\u20131177. https:\/\/doi.org\/10.1109\/TCSVT.2011.2138830. https:\/\/ieeexplore.ieee.org\/document\/5742987\/","journal-title":"IEEE Trans Circ Syst Video Technol"},{"issue":"6","key":"10450_CR58","doi-asserted-by":"publisher","first-page":"904","DOI":"10.1109\/TCSVT.2011.2181231","volume":"22","author":"P Sidiropoulos","year":"2012","unstructured":"Sidiropoulos P, Mezaris V, Kompatsiaris I, Kittler J (2012) Differential edit distance: a metric for scene segmentation evaluation. IEEE Trans Circ Syst Video Technol 22(6):904\u2013914. https:\/\/doi.org\/10.1109\/TCSVT.2011.2181231. https:\/\/ieeexplore.ieee.org\/document\/6111460\/","journal-title":"IEEE Trans Circ Syst Video Technol"},{"unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556","key":"10450_CR59"},{"issue":"12","key":"10450_CR60","doi-asserted-by":"publisher","first-page":"1349","DOI":"10.1109\/34.895972","volume":"22","author":"A Smeulders","year":"2000","unstructured":"Smeulders A, Worring M, Santini S, Gupta A, Jain R (2000) Content-based image retrieval at the end of the early years. IEEE Trans Pattern Anal Mach Intell 22(12):1349\u20131380. https:\/\/doi.org\/10.1109\/34.895972. http:\/\/dl.acm.org\/citation.cfm?id=357871.357873","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"2","key":"10450_CR61","first-page":"181","volume":"5","author":"R Sperandio","year":"2014","unstructured":"Sperandio R, Patroc\u00ednio ZKG Jr, Paula HB, Guimar\u00e3es SJF (2014) Exploring strategies for minimizing overlap between nodes in a multimodal metric tree. J Inf 5(2):181\u2013193. https:\/\/seer.lcc.ufmg.br\/index.php\/jidm\/article\/view\/633","journal-title":"J Inf"},{"doi-asserted-by":"publisher","unstructured":"Trojahn TH, Goularte R (2013) Video scene segmentation by improved visual shot coherence. In: Proceedings of the 19th Brazilian symposium on multimedia and the web, WebMedia \u201913. https:\/\/doi.org\/10.1145\/2526188.2526206. http:\/\/doi.acm.org\/10.1145\/2526188.2526206. ACM, New York, pp 23\u201330","key":"10450_CR62","DOI":"10.1145\/2526188.2526206"},{"doi-asserted-by":"publisher","unstructured":"Trojahn TH, Kishi RM, Goularte R (2018) A new multimodal deep-learning model to video scene segmentation. In: Proceedings of the 24th Brazilian symposium on multimedia and the web, WebMedia \u201918. https:\/\/doi.org\/10.1145\/3243082.3243108. ACM, New York, pp 205\u2013212","key":"10450_CR63","DOI":"10.1145\/3243082.3243108"},{"issue":"4","key":"10450_CR64","doi-asserted-by":"publisher","first-page":"492","DOI":"10.1109\/TMM.2002.802021","volume":"4","author":"J Vendrig","year":"2002","unstructured":"Vendrig J, Worring M (2002) Systematic evaluation of logical story unit segmentation. IEEE Trans Multimed 4(4):492\u2013499. https:\/\/doi.org\/10.1109\/TMM.2002.802021. http:\/\/ieeexplore.ieee.org\/document\/1176947\/","journal-title":"IEEE Trans Multimed"},{"issue":"3","key":"10450_CR65","doi-asserted-by":"publisher","first-page":"1845","DOI":"10.1109\/TIT.2017.2776228","volume":"64","author":"T Wiatowski","year":"2018","unstructured":"Wiatowski T, Bolcskei H (2018) A mathematical theory of deep convolutional neural networks for feature extraction. IEEE Trans Inf Theory 64 (3):1845\u20131866. https:\/\/doi.org\/10.1109\/TIT.2017.2776228. http:\/\/ieeexplore.ieee.org\/document\/8116648\/","journal-title":"IEEE Trans Inf Theory"},{"doi-asserted-by":"publisher","unstructured":"Xu J, Song L, Xie R (2016) Shot boundary detection using convolutional neural networks. In: 2016 Visual communications and image processing (VCIP). https:\/\/doi.org\/10.1109\/VCIP.2016.7805554, pp 1\u20134","key":"10450_CR66","DOI":"10.1109\/VCIP.2016.7805554"},{"doi-asserted-by":"publisher","unstructured":"Yeung M M, Yeo B L, Wolf W H, Liu B (1995) Video browsing using clustering and scene transitions on compressed sequences. In: Rodriguez A A, Maitan J (eds). https:\/\/doi.org\/10.1117\/12.206067. http:\/\/proceedings.spiedigitallibrary.org\/proceeding.aspx?articleid=991685, vol 2417. Society of Photo-Optical Instrumentation Engineers, Multimedia computing and networking, pp 399\u2013413","key":"10450_CR67","DOI":"10.1117\/12.206067"},{"issue":"1","key":"10450_CR68","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1006\/cviu.1997.0628","volume":"71","author":"M Yeung","year":"1998","unstructured":"Yeung M, Yeo B L, Liu B (1998) Segmentation of video by clustering and graph analysis. Comput Vis Image Underst 71(1):94\u2013109. https:\/\/doi.org\/10.1006\/cviu.1997.0628. http:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1077314297906287","journal-title":"Comput Vis Image Underst"},{"doi-asserted-by":"publisher","unstructured":"Zhao B, Li X, Lu X (2018) HSA-RNN: Hierarchical Structure-Adaptive RNN for video summarization. In: 2018 IEEE conference on computer vision and pattern recognition, pp 7405\u20137414. https:\/\/doi.org\/10.1109\/CVPR.2018.00773. https:\/\/ieeexplore.ieee.org\/document\/8578871","key":"10450_CR69","DOI":"10.1109\/CVPR.2018.00773"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-10450-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-020-10450-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-10450-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,5,20]],"date-time":"2021-05-20T08:47:23Z","timestamp":1621500443000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-020-10450-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,2,8]]},"references-count":69,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2021,5]]}},"alternative-id":["10450"],"URL":"https:\/\/doi.org\/10.1007\/s11042-020-10450-2","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2021,2,8]]},"assertion":[{"value":"12 August 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 September 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 December 2020","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 February 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}