{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,24]],"date-time":"2025-05-24T09:07:29Z","timestamp":1748077649951,"version":"3.37.3"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2018,12,20]],"date-time":"2018-12-20T00:00:00Z","timestamp":1545264000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2019,5]]},"DOI":"10.1007\/s11042-018-7040-z","type":"journal-article","created":{"date-parts":[[2018,12,19]],"date-time":"2018-12-19T23:25:41Z","timestamp":1545261941000},"page":"14007-14027","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":21,"title":["M-VAD names: a dataset for video captioning with naming"],"prefix":"10.1007","volume":"78","author":[{"given":"Stefano","family":"Pini","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9640-9385","authenticated-orcid":false,"given":"Marcella","family":"Cornia","sequence":"additional","affiliation":[]},{"given":"Federico","family":"Bolelli","sequence":"additional","affiliation":[]},{"given":"Lorenzo","family":"Baraldi","sequence":"additional","affiliation":[]},{"given":"Rita","family":"Cucchiara","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,12,20]]},"reference":[{"key":"7040_CR1","doi-asserted-by":"crossref","unstructured":"Babenko B, Yang MH, Belongie S (2009) Visual tracking with online multiple instance learning. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2009.5206737"},{"key":"7040_CR2","doi-asserted-by":"crossref","unstructured":"Baraldi L, Grana C, Cucchiara R (2017) Hierarchical boundary-aware neural encoder for video captioning. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2017.339"},{"key":"7040_CR3","doi-asserted-by":"crossref","unstructured":"Bojanowski P, Bach F, Laptev I, Ponce J, Schmid C, Sivic J (2013) Finding actors and actions in movies. In: IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2013.283"},{"key":"7040_CR4","doi-asserted-by":"crossref","unstructured":"Ding L, Yilmaz A (2010) Learning relations among movie characters: a social network perspective. In: European conference on computer vision","DOI":"10.1007\/978-3-642-15561-1_30"},{"key":"7040_CR5","doi-asserted-by":"crossref","unstructured":"Donahue J, Anne Hendricks L, Guadarrama S, Rohrbach M, Venugopalan S, Saenko K, Darrell T (2015) Long-term recurrent convolutional networks for visual recognition and description. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.21236\/ADA623249"},{"key":"7040_CR6","doi-asserted-by":"crossref","unstructured":"Everingham M, Sivic J, Zisserman A (2006) Hello! my name is... Buffy\u2013automatic naming of characters in TV video. In: British machine vision conference","DOI":"10.5244\/C.20.92"},{"key":"7040_CR7","doi-asserted-by":"crossref","unstructured":"Guo Y, Zhang L, Hu Y, He X, Gao J (2016) MS-Celeb-1m: a dataset and benchmark for large-scale face recognition. In: European conference on computer vision","DOI":"10.1007\/978-3-319-46487-9_6"},{"key":"7040_CR8","doi-asserted-by":"crossref","unstructured":"Hendricks LA, Venugopalan S, Rohrbach M, Mooney R, Saenko K, Darrell T (2016) Deep compositional captioning: describing novel object categories without paired training data. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2016.8"},{"key":"7040_CR9","doi-asserted-by":"crossref","unstructured":"Jin S, Su H, Stauffer C, Learned-Miller E (2017) End-to-end face detection and cast grouping in movies using Erdos-renyi\u0301 clustering. In: IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2017.564"},{"key":"7040_CR10","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"7040_CR11","doi-asserted-by":"crossref","unstructured":"Karpathy A, Toderici G, Shetty S, Leung T, Sukthankar R, Fei-Fei L (2014) Large-scale video classification with convolutional neural networks. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2014.223"},{"key":"7040_CR12","unstructured":"Kiros R, Salakhutdinov R, Zemel R (2014) Unifying visual-semantic embeddings with multimodal neural language models. arXiv:\n                    1411.2539"},{"key":"7040_CR13","doi-asserted-by":"crossref","unstructured":"Krishna R, Hata K, Ren F, Fei-Fei L, Niebles JC (2017) Dense-captioning events in videos. In: IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2017.83"},{"issue":"1-2","key":"7040_CR14","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1002\/nav.3800020109","volume":"2","author":"HW Kuhn","year":"1955","unstructured":"Kuhn HW (1955) The hungarian method for the assignment problem. Naval Research Logistics Quarterly 2(1-2):83\u201397","journal-title":"Naval Research Logistics Quarterly"},{"key":"7040_CR15","first-page":"2579","volume":"9","author":"LVD Maaten","year":"2008","unstructured":"Maaten LVD, Hinton G (2008) Visualizing data using t-SNE. J Mach Learn Res 9:2579\u20132605","journal-title":"J Mach Learn Res"},{"issue":"3","key":"7040_CR16","doi-asserted-by":"publisher","first-page":"282","DOI":"10.1007\/s11263-013-0655-7","volume":"106","author":"MJ Mar\u00edn-Jim\u00e9nez","year":"2014","unstructured":"Mar\u00edn-Jim\u00e9nez MJ, Zisserman A, Eichner M, Ferrari V (2014) Detecting people looking at each other in videos. Int J Comput Vis 106(3):282\u2013296","journal-title":"Int J Comput Vis"},{"key":"7040_CR17","doi-asserted-by":"crossref","unstructured":"Miech A, Alayrac JB, Bojanowski P, Laptev I, Sivic J (2017) Learning from video and text via large-scale discriminative clustering. In: IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2017.562"},{"key":"7040_CR18","doi-asserted-by":"crossref","unstructured":"Pan P, Xu Z, Yang Y, Wu F, Zhuang Y (2016) Hierarchical recurrent neural encoder for video representation with application to captioning. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2016.117"},{"key":"7040_CR19","doi-asserted-by":"crossref","unstructured":"Pan Y, Mei T, Yao T, Li H, Rui Y (2016) Jointly modeling embedding and translation to bridge video and language. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2016.497"},{"issue":"12","key":"7040_CR20","doi-asserted-by":"publisher","first-page":"2441","DOI":"10.1109\/TPAMI.2012.24","volume":"34","author":"A Patron-Perez","year":"2012","unstructured":"Patron-Perez A, Marszalek M, Reid I, Zisserman A (2012) Structured learning of human interactions in TV shows. IEEE Trans Pattern Anal Mach Intell 34 (12):2441\u20132453","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"7040_CR21","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning CD (2014) Glove: global vectors for word representation. In: Conference on empirical methods in natural language processing","DOI":"10.3115\/v1\/D14-1162"},{"key":"7040_CR22","doi-asserted-by":"crossref","unstructured":"Pini S, Cornia M, Baraldi L, Cucchiara R (2017) Towards video captioning with naming: a novel dataset and a multi-modal approach. In: International conference on image analysis and processing","DOI":"10.1007\/978-3-319-68548-9_36"},{"key":"7040_CR23","doi-asserted-by":"crossref","unstructured":"Ramanathan V, Joulin A, Liang P, Fei-Fei L (2014) Linking people in videos with \u201ctheir\u201d names using coreference resolution. In: European conference on computer vision","DOI":"10.1007\/978-3-319-10590-1_7"},{"key":"7040_CR24","doi-asserted-by":"crossref","unstructured":"Rohrbach A, Rohrbach M, Schiele B (2015) The long-short story of movie description. In: German conference on pattern recognition","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"7040_CR25","doi-asserted-by":"crossref","unstructured":"Rohrbach A, Rohrbach M, Tandon N, Schiele B (2015) A dataset for movie description. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"7040_CR26","doi-asserted-by":"crossref","unstructured":"Rohrbach A, Rohrbach M, Tang S, Oh SJ, Schiele B (2017) Generating descriptions with grounded and co-referenced people. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2017.447"},{"key":"7040_CR27","doi-asserted-by":"crossref","unstructured":"Schroff F, Kalenichenko D, Philbin J (2015) Facenet: a unified embedding for face recognition and clustering. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"7040_CR28","doi-asserted-by":"crossref","unstructured":"Shetty R, Rohrbach M, Hendricks LA, Fritz M, Schiele B (2017) Speaking the same language: matching machine to human captions by adversarial training. In: IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2017.445"},{"key":"7040_CR29","doi-asserted-by":"crossref","unstructured":"Sivic J, Everingham M, Zisserman A (2009) Who are you? Learning person specific classifiers from video. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2009.5206513"},{"key":"7040_CR30","doi-asserted-by":"crossref","unstructured":"Socher R, Karpathy A, Le QV, Manning CD, Ng AY (2014) Grounded compositional semantics for finding and describing images with sentences. Transactions of the Association of Computational Linguistics 2(1):207\u2013218","DOI":"10.1162\/tacl_a_00177"},{"key":"7040_CR31","doi-asserted-by":"crossref","unstructured":"Tapaswi M, B\u00e4uml M, Stiefelhagen R (2012) Knock! Knock! Who is it? probabilistic person identification in TV-series. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2012.6247986"},{"key":"7040_CR32","unstructured":"Torabi A, Pal C, Larochelle H, Courville A (2015)"},{"key":"7040_CR33","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2015.510"},{"issue":"1","key":"7040_CR34","first-page":"3221","volume":"15","author":"L Der Maaten Van","year":"2014","unstructured":"Van Der Maaten L (2014) Accelerating t-SNE using tree-based algorithms. J Mach Learn Res 15(1):3221\u20133245","journal-title":"J Mach Learn Res"},{"key":"7040_CR35","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Xu H, Donahue J, Rohrbach M, Mooney R, Saenko K (2014) Translating videos to natural language using deep recurrent neural networks. North American Chapter of the Association for Computational Linguistics","DOI":"10.3115\/v1\/N15-1173"},{"key":"7040_CR36","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Hendricks LA, Mooney R, Saenko K (2016) Improving lstm-based video description with linguistic knowledge mined from text. In: Conf. on empirical methods in natural language processing","DOI":"10.18653\/v1\/D16-1204"},{"key":"7040_CR37","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Rohrbach M, Donahue J, Mooney R, Darrell T, Saenko K (2015) Sequence to sequence-video to text. In: IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2015.515"},{"key":"7040_CR38","doi-asserted-by":"crossref","unstructured":"Vicol P, Tapaswi M, Castrejon L, Fidler S (2018) Moviegraphs: towards understanding human-centric situations from videos. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2018.00895"},{"issue":"301","key":"7040_CR39","doi-asserted-by":"publisher","first-page":"236","DOI":"10.1080\/01621459.1963.10500845","volume":"58","author":"JHJ Ward","year":"1963","unstructured":"Ward JHJ (1963) Hierarchical grouping to optimize an objective function. J Am Stat Assoc 58(301):236\u2013244","journal-title":"J Am Stat Assoc"},{"key":"7040_CR40","doi-asserted-by":"crossref","unstructured":"Yao L, Torabi A, Cho K, Ballas N, Pal C, Larochelle H, Courville A (2015) Describing videos by exploiting temporal structure. In: IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2015.512"},{"key":"7040_CR41","doi-asserted-by":"crossref","unstructured":"Yu H, Wang J, Huang Z, Yang Y, Xu W (2016) Video paragraph captioning using hierarchical recurrent neural networks. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2016.496"},{"issue":"10","key":"7040_CR42","doi-asserted-by":"publisher","first-page":"1499","DOI":"10.1109\/LSP.2016.2603342","volume":"23","author":"K Zhang","year":"2016","unstructured":"Zhang K, Zhang Z, Li Z, Qiao Y (2016) Joint face detection and alignment using multitask cascaded convolutional networks. IEEE Signal Process Lett 23(10):1499\u20131503","journal-title":"IEEE Signal Process Lett"},{"key":"7040_CR43","doi-asserted-by":"crossref","unstructured":"Zhu Y, Kiros R, Zemel R, Salakhutdinov R, Urtasun R, Torralba A, Fidler S (2015) Aligning books and movies: towards story-like visual explanations by watching movies and reading books. In: IEEE international conference on computer vision and pattern recognition","DOI":"10.1109\/ICCV.2015.11"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-018-7040-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11042-018-7040-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-018-7040-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,12,19]],"date-time":"2019-12-19T19:10:19Z","timestamp":1576782619000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11042-018-7040-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,12,20]]},"references-count":43,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2019,5]]}},"alternative-id":["7040"],"URL":"https:\/\/doi.org\/10.1007\/s11042-018-7040-z","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2018,12,20]]},"assertion":[{"value":"1 February 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 November 2018","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 December 2018","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 December 2018","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}