{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,7]],"date-time":"2026-02-07T09:19:32Z","timestamp":1770455972109,"version":"3.49.0"},"reference-count":89,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001824","name":"Grantov\u00e1 Agentura \u010cesk\u00e9 Republiky","doi-asserted-by":"publisher","award":["19-22071Y"],"award-info":[{"award-number":["19-22071Y"]}],"id":[{"id":"10.13039\/501100001824","id-type":"DOI","asserted-by":"publisher"}]},{"name":"European Unions Horizon 2020 research and innovation programme: V4Design","award":["779962"],"award-info":[{"award-number":["779962"]}]},{"name":"MARCONI","award":["761802"],"award-info":[{"award-number":["761802"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. 
Multimedia"],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/tmm.2020.2980944","type":"journal-article","created":{"date-parts":[[2020,3,17]],"date-time":"2020-03-17T02:24:56Z","timestamp":1584411896000},"page":"243-256","source":"Crossref","is-referenced-by-count":64,"title":["Interactive Video Retrieval in the Age of Deep Learning \u2013 Detailed Evaluation of VBS 2019"],"prefix":"10.1109","volume":"23","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5389-9465","authenticated-orcid":false,"given":"Luca","family":"Rossetto","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3016-1396","authenticated-orcid":false,"given":"Ralph","family":"Gasser","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3558-4144","authenticated-orcid":false,"given":"Jakub","family":"Lokoc","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2442-4900","authenticated-orcid":false,"given":"Werner","family":"Bailer","sequence":"additional","affiliation":[]},{"given":"Klaus","family":"Schoeffmann","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7746-6234","authenticated-orcid":false,"given":"Bernd","family":"Muenzer","sequence":"additional","affiliation":[]},{"given":"Tomas","family":"Soucek","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1289-3785","authenticated-orcid":false,"given":"Phuong Anh","family":"Nguyen","sequence":"additional","affiliation":[]},{"given":"Paolo","family":"Bolettieri","sequence":"additional","affiliation":[]},{"given":"Andreas","family":"Leibetseder","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2505-9178","authenticated-orcid":false,"given":"Stefanos","family":"Vrochidis","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref73","article-title":"TransNet: A deep network for fast detection of common shot 
transitions","author":"sou\u010dek","year":"2019"},{"key":"ref72","article-title":"Deep learning-based concept detection in vitrivr at the video browser showdown - final notes","author":"rossetto","year":"2019"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-14442-9_24"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-51814-5_43"},{"key":"ref76","first-page":"609","article-title":"VIREO@ video browser showdown","author":"nguyen","year":"0","journal-title":"Proc Int Conf Multimedia Model"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-05716-9_23"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-51814-5_42"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806221"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-73600-6_42"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1145\/3095713.3095740"},{"key":"ref79","article-title":"YOLOv3: An incremental improvement","author":"redmon","year":"2018"},{"key":"ref33","article-title":"The open images dataset V4: Unified image classification, object detection, and visual relationship detection at scale","author":"kuznetsova","year":"2018"},{"key":"ref32","first-page":"4278","article-title":"Inception-v4, inception-resnet and the impact of residual connections on learning","author":"szegedy","year":"0","journal-title":"Proc 31st AAAI Conf Artif Intell"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2723009"},{"key":"ref30","first-page":"740","article-title":"Microsoft COCO: Common objects in context","author":"lin","year":"0","journal-title":"Proc Eur Conf Comput Vision"},{"key":"ref37","article-title":"The kinetics human action video 
dataset","author":"kay","year":"2017"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.590"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3078971.3079041"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2848458"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref28","article-title":"VIREO@TRECVID: Instance search and semantic indexing","author":"zhang","year":"0","journal-title":"Proc NIST TRECVID Workshop"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-35063-4_24"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351046"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2973797"},{"key":"ref29","article-title":"Creating HAVIC: Heterogeneous audio visual internet collection","author":"strassel","year":"0","journal-title":"Proc 8th Conf Int Lang Resources Eval"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1145\/3178422.3178430"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1145\/3178422.3178430"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3323873.3326921"},{"key":"ref2","doi-asserted-by":"crossref","first-page":"797","DOI":"10.1109\/TSMCC.2011.2109710","article-title":"A survey on visual content-based video indexing and retrieval","volume":"41","author":"hu","year":"2011","journal-title":"IEEE Trans Syst Man Cybern C (Appl Rev )"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1117\/6.0000005"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-017-4799-2"},{"key":"ref22","article-title":"Query by semantic 
sketch","author":"rossetto","year":"2019"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-018-7148-1"},{"key":"ref24","first-page":"801","article-title":"Encoder-decoder with atrous separable convolution for semantic image segmentation","author":"chen","year":"0","journal-title":"Proc Eur Conf Comput Vision"},{"key":"ref23","article-title":"Semantic image segmentation with deep convolutional nets and fully connected CRFs","author":"chen","year":"2014"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00244"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3323873.3325034"},{"key":"ref51","first-page":"609","article-title":"Vireo@ video browser showdown","author":"nguyen","year":"0","journal-title":"Proc Int Conf Multimedia Model"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2010.57"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00907"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-73600-6_45"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3210539.3210543"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ISM.2014.38"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-05716-9_53"},{"key":"ref53","first-page":"585","article-title":"Autopiloting feature maps: The deep interactive video exploration (divexplore) system at VBS2019","author":"schoeffmann","year":"0","journal-title":"Proc Int Conf Multimedia Model"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-05716-9_51"},{"key":"ref10","first-page":"1","article-title":"Video browser showdown: A review","author":"schoeffmann","year":"0","journal-title":"Proc Int Conf Content-Based Multimedia Indexing"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3323873.3325051"},{"key":"ref40","first-page":"2048","article-title":"Show, attend and 
tell: Neural image caption generation with visual attention","author":"xu","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.1999.790410"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/2911996.2912036"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/DCC.2017.31"},{"key":"ref15","first-page":"3471","article-title":"Face video retrieval via deep learning of binary hash representations","author":"dong","year":"0","journal-title":"Proc 13th AAAI Conf Artif Intell"},{"key":"ref82","first-page":"1","article-title":"Particular object retrieval with integral max-pooling of CNN activations","author":"tolias","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2882155"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.690"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/s00778-015-0391-4"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46759-7_7"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/s11280-018-0541-x"},{"key":"ref83","first-page":"1","article-title":"Particular object retrieval with integral max-pooling of CNN activations","author":"tolias","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350974"},{"key":"ref80","article-title":"Yolov3: An incremental improvement","author":"redmon","year":"2018"},{"key":"ref89","first-page":"438","article-title":"The ITEC collaborative video search system at the video browser showdown","author":"primus","year":"0","journal-title":"Proc Int Conf Multimedia 
Model"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2895511"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2004.834868"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2729019"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2890361"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-43946-4_14"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2830110"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-05716-9_50"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-016-3661-2"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-05716-9_55"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2014.2333666"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW.2018.8551552"},{"key":"ref9","first-page":"29:1","article-title":"Interactive search or sequential browsing? A detailed analysis of the video browser showdown","volume":"15","author":"loko\u010d","year":"2019","journal-title":"ACM Trans Multimedia Comput Commun Appl"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-05710-7_29"},{"key":"ref45","article-title":"TRECVID: Benchmarking video activity detection, video captioning and matching, video storytelling linking and video search","author":"awad","year":"0","journal-title":"Proc TRECVID"},{"key":"ref48","article-title":"Web video in numbers-an analysis of web-video metadata","author":"rossetto","year":"0"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/1631135.1631141"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350906"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2832602"},{"key":"ref44","first-page":"13","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"lu","year":"0","journal-title":"Proc 
Adv Neural Inf Process Syst"},{"key":"ref43","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6046\/9296985\/09037125.pdf?arnumber=9037125","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T14:51:31Z","timestamp":1652194291000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9037125\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":89,"URL":"https:\/\/doi.org\/10.1109\/tmm.2020.2980944","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]}}}