{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:39:08Z","timestamp":1740123548784,"version":"3.37.3"},"reference-count":66,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2017,6,9]],"date-time":"2017-06-09T00:00:00Z","timestamp":1496966400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2017,6,9]],"date-time":"2017-06-09T00:00:00Z","timestamp":1496966400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"funder":[{"DOI":"10.13039\/100006754","name":"Army Research Laboratory","doi-asserted-by":"crossref","award":["W911NF-10-2-0060"],"award-info":[{"award-number":["W911NF-10-2-0060"]}],"id":[{"id":"10.13039\/100006754","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["522954-IIS"],"award-info":[{"award-number":["522954-IIS"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2017,9]]},"DOI":"10.1007\/s11263-017-1018-6","type":"journal-article","created":{"date-parts":[[2017,6,9]],"date-time":"2017-06-09T11:24:56Z","timestamp":1497007496000},"page":"312-334","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Sentence Directed Video Object Codiscovery"],"prefix":"10.1007","volume":"124","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9882-4929","authenticated-orcid":false,"given":"Haonan","family":"Yu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jeffrey Mark","family":"Siskind","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,6,9]]},"reference":[{"issue":"11","key":"1018_CR1","doi-asserted-by":"publisher","first-page":"2274","DOI":"10.1109\/TPAMI.2012.120","volume":"34","author":"R Achanta","year":"2012","unstructured":"Achanta, R., Shaji, A., Smith, K., Lucchi, A., Fua, P., & Susstrunk, S. (2012). SLIC superpixels compared to state-of-the-art superpixel methods. IEEE Transactions on Pattern Analysis and Machine Intelligence, 34(11), 2274\u20132282.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1018_CR2","unstructured":"Alexe, B., Deselaers, T., & Ferrari, V. (2010). What is an object? In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 73\u201380)."},{"key":"1018_CR3","unstructured":"Andres, B., Beier, T., & Kappes, J. H. (2012). OpenGM: A C++ library for discrete graphical models. CoRR 1206.0111."},{"key":"1018_CR4","doi-asserted-by":"crossref","unstructured":"Arbelaez, P., Pont-Tuset, J., Barron, J., Marqu\u00e9s, F., & Malik, J. (2014). Multiscale combinatorial grouping. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 328\u2013335).","DOI":"10.1109\/CVPR.2014.49"},{"key":"1018_CR5","unstructured":"Barbu, A., Bridge, A., Burchill, Z., Coroian, D., Dickinson, S., & Fidler, S,. et al. (2012). Video in sentences out. In Proceedings of the conference on uncertainty in artificial intelligence (pp. 102\u2013112)."},{"key":"1018_CR6","doi-asserted-by":"crossref","unstructured":"Berg, T. L., Berg, A. C., Edwards, J., Maire, M., White, R., Teh, Y. W., Learned-Miller, E. G., & Forsyth, D. A. (2004). Names and faces in the news. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 848\u2013854).","DOI":"10.1109\/CVPR.2004.1315253"},{"key":"1018_CR7","unstructured":"Blaschko, M., Vedaldi, A., & Zisserman, A. (2010). Simultaneous object detection and ranking with weak supervision. In Advances in neural information processing systems (pp. 235\u2013243)."},{"key":"1018_CR8","doi-asserted-by":"crossref","unstructured":"Bojanowski, P., Lajugie, R., Grave, E., Bach, F., Laptev, I., Ponce, J., & Schmid, C. (2015). Weakly-supervised alignment of video with text. In Proceedings of the IEEE international conference on computer vision (pp. 4462\u20134470).","DOI":"10.1109\/ICCV.2015.507"},{"key":"1018_CR9","doi-asserted-by":"crossref","unstructured":"Bosch, A., Zisserman, A., & Munoz, X. (2007). Image classification using random forests and ferns. In Proceedings of the IEEE international conference on computer vision (pp. 1\u20138).","DOI":"10.1109\/ICCV.2007.4409066"},{"key":"1018_CR10","unstructured":"Bradski, G.R. (1998). Computer vision face tracking for use in a perceptual user interface, Intel Technology Journal, Q2(Q2), 214\u2013219."},{"key":"1018_CR11","unstructured":"Bylinskii, Z., Judd, T., Borji, A., Itti, L., Durand, F., Oliva, A., & Torralba, A. (2012). MIT saliency benchmark"},{"key":"1018_CR12","doi-asserted-by":"crossref","unstructured":"Cheng, M. M., Zhang, Z., Lin, W. Y., & Torr, P. (2014). BING: Binarized normed gradients for objectness estimation at 300fps. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3286\u20133293).","DOI":"10.1109\/CVPR.2014.414"},{"key":"1018_CR13","doi-asserted-by":"crossref","unstructured":"Cinbis, R. G., Verbeek, J., & Schmid, C. (2014). Multi-fold mil training for weakly supervised object localization. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2409\u20132416).","DOI":"10.1109\/CVPR.2014.309"},{"key":"1018_CR14","unstructured":"Clarke, J., Goldwasser, D., Chang, M. W., & Roth, D. (2010). Driving semantic parsing from the world\u2019s response. In Proceedings of the conference on computational natural language learning (pp. 18\u201327)."},{"key":"1018_CR15","doi-asserted-by":"crossref","unstructured":"Dalal, N., & Triggs, B. (2005). Histograms of oriented gradients for human detection. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 886\u2013893).","DOI":"10.1109\/CVPR.2005.177"},{"key":"1018_CR16","doi-asserted-by":"crossref","unstructured":"Das, P., Xu, C., Doell, R. F., & Corso, J. J. (2013). A thousand frames in just a few words: Lingual description of videos through latent topics and sparse object stitching. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2634\u20132641).","DOI":"10.1109\/CVPR.2013.340"},{"issue":"2","key":"1018_CR17","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1207\/s15516709cog1402_1","volume":"14","author":"JL Elman","year":"1990","unstructured":"Elman, J. L. (1990). Finding structure in time. Cognitive Science, 14(2), 179\u2013211.","journal-title":"Cognitive Science"},{"key":"1018_CR18","doi-asserted-by":"crossref","unstructured":"Farneb\u00e4ck, G. (2003). Two-frame motion estimation based on polynomial expansion. In Proceedings of the scandinavian conference on image analysis (pp. 363\u2013370).","DOI":"10.1007\/3-540-45103-X_50"},{"issue":"1\u20132","key":"1018_CR19","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1002\/nav.3800030109","volume":"3","author":"M Frank","year":"1956","unstructured":"Frank, M., & Wolfe, P. (1956). An algorithm for quadratic programming. Naval Research Logistics, 3(1\u20132), 95\u2013110.","journal-title":"Naval Research Logistics"},{"key":"1018_CR20","doi-asserted-by":"crossref","unstructured":"Girshick, R. (2015). Fast R-CNN. In Proceedings of the IEEE international conference on computer vision (pp. 1440\u20131448).","DOI":"10.1109\/ICCV.2015.169"},{"key":"1018_CR21","unstructured":"Graves, A., Wayne, G., & Danihelka, I. (2014). Neural Turing machines. CoRR 1410.5401."},{"key":"1018_CR22","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., Krishnamoorthy, N., Malkarnenkar, G., Venugopalan, S., Mooney, R., Darrell, T., & Saenko, K. (2013). YouTube2Text: Recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In Proceedings of the IEEE international conference on computer vision (pp. 2712\u20132719).","DOI":"10.1109\/ICCV.2013.337"},{"key":"1018_CR23","doi-asserted-by":"crossref","unstructured":"Gupta, A., & Davis, L. S. (2008). Beyond nouns: Exploiting prepositions and comparative adjectives for learning visual classifiers. In Proceedings of the european conference on computer vision (pp. 16\u201329).","DOI":"10.1007\/978-3-540-88682-2_3"},{"issue":"8","key":"1018_CR24","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural Computation, 9(8), 1735\u20131780.","journal-title":"Neural Computation"},{"key":"1018_CR25","doi-asserted-by":"crossref","unstructured":"Jamieson, M., Eskin, Y., Fazly, A., Stevenson, S., & Dickinson, S. (2010a) Discovering multipart appearance models from captioned images. In Proceedings of the European conference on computer vision (pp. 183\u2013196).","DOI":"10.1007\/978-3-642-15555-0_14"},{"issue":"1","key":"1018_CR26","doi-asserted-by":"publisher","first-page":"148","DOI":"10.1109\/TPAMI.2008.283","volume":"32","author":"M Jamieson","year":"2010","unstructured":"Jamieson, M., Fazly, A., Stevenson, S., Dickinson, S. J., & Wachsmuth, S. (2010b). Using language to learn structured appearance models for image annotation. IEEE Transactions on Pattern Analysis and Machine Intelligence, 32(1), 148\u2013164.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1018_CR27","doi-asserted-by":"crossref","unstructured":"Jiang, M., Huang, S., Duan, J., & Zhao, Q. (2015). SALICON: Saliency in context. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1072\u20131080).","DOI":"10.1109\/CVPR.2015.7298710"},{"key":"1018_CR28","doi-asserted-by":"crossref","unstructured":"Joulin, A., Tang, K., & Fei-Fei, L. (2014). Efficient image and video co-localization with Frank-Wolfe algorithm. In Proceedings of the European conference on computer vision (pp. 253\u2013268).","DOI":"10.1007\/978-3-319-10599-4_17"},{"key":"1018_CR29","doi-asserted-by":"crossref","unstructured":"Kong, C., Lin, D., Bansal, M., Urtasun, R., & Fidler, S. (2014). What are you talking about? Text-to-image coreference. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3558\u20133565).","DOI":"10.1109\/CVPR.2014.455"},{"issue":"8","key":"1018_CR30","doi-asserted-by":"publisher","first-page":"951","DOI":"10.1177\/0278364913478446","volume":"32","author":"HS Koppula","year":"2013","unstructured":"Koppula, H. S., Gupta, R., & Saxena, A. (2013). Learning human activities and object affordances from RGB-D videos. International Journal of Robotics Research, 32(8), 951\u2013970.","journal-title":"International Journal of Robotics Research"},{"key":"1018_CR31","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., & Serre, T. (2011). HMDB: A large video database for human motion recognition. In Proceedings of the IEEE international conference on computer vision (pp. 2556\u20132563).","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"1018_CR32","doi-asserted-by":"crossref","unstructured":"Kwak, S., Cho, M., Laptev, I., Ponce, J., & Schmid, C. (2015). Unsupervised object discovery and tracking in video collections. In Proceedings of the IEEE international conference on computer vision (pp. 3173\u20133181).","DOI":"10.1109\/ICCV.2015.363"},{"key":"1018_CR33","doi-asserted-by":"crossref","unstructured":"Lee, Y. J., & Grauman, K. (2011). Learning the easy things first: Self-paced visual category discovery. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1721\u20131728).","DOI":"10.1109\/CVPR.2011.5995523"},{"key":"1018_CR34","doi-asserted-by":"crossref","unstructured":"Lin, D., Fidler, S., Kong, C., & Urtasun, R. (2014). Visual semantic search: Retrieving videos via complex textual queries. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2657\u20132664).","DOI":"10.1109\/CVPR.2014.340"},{"issue":"2","key":"1018_CR35","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"DG Lowe","year":"2004","unstructured":"Lowe, D. G. (2004). Distinctive image features from scale-invariant keypoints. International Journal of Computer Vision, 60(2), 91\u2013110.","journal-title":"International Journal of Computer Vision"},{"key":"1018_CR36","unstructured":"Luo, J., Caputo, B., & Ferrari, V. (2009). Who\u2019s doing what: Joint modeling of names and verbs for simultaneous face and pose annotation. In Advances in neural information processing systems (pp. 1168\u20131176)."},{"key":"1018_CR37","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A., & Murphy, K. (2016). Generation and comprehension of unambiguous object descriptions. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 11\u201320).","DOI":"10.1109\/CVPR.2016.9"},{"key":"1018_CR38","doi-asserted-by":"crossref","unstructured":"Marsza\u0142ek, M., Laptev, I., & Schmid, C. (2009). Actions in context. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2929\u20132936).","DOI":"10.1109\/CVPR.2009.5206557"},{"issue":"1","key":"1018_CR39","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1162\/0891201053630264","volume":"31","author":"M Palmer","year":"2005","unstructured":"Palmer, M., Gildea, D., & Kingsbury, P. (2005). The proposition bank: An annotated corpus of semantic roles. Computational Linguistics, 31(1), 71\u2013106.","journal-title":"Computational Linguistics"},{"key":"1018_CR40","unstructured":"Pearl, J. (1982). Reverend Bayes on inference engines: A distributed hierarchical approach. In Proceedings of the conference on artificial intelligence (pp. 133\u2013136)."},{"key":"1018_CR41","doi-asserted-by":"crossref","unstructured":"Plummer, B. A., Wang, L., Cervantes, C. M., Caicedo, J. C., Hockenmaier, J., & Lazebnik, S. (2015). Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In Proceedings of the IEEE international conference on computer vision (pp. 2641\u20132649).","DOI":"10.1109\/ICCV.2015.303"},{"key":"1018_CR42","doi-asserted-by":"crossref","unstructured":"Prest, A., Leistner, C., Civera, J., Schmid, C., & Ferrari, V. (2012). Learning object class detectors from weakly annotated video. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3282\u20133289).","DOI":"10.1109\/CVPR.2012.6248065"},{"key":"1018_CR43","doi-asserted-by":"crossref","unstructured":"Ramanathan, V., Joulin, A., Liang, P., & Fei-Fei, L. (2014). Linking people with \u201ctheir\u201d names using coreference resolution. In Proceedings of the european conference on computer vision (pp. 95\u2013110).","DOI":"10.1007\/978-3-319-10590-1_7"},{"key":"1018_CR44","doi-asserted-by":"crossref","unstructured":"Rodriguez, M. D., Ahmed, J., & Shah, M. (2008). Action MACH: A spatio-temporal maximum average correlation height filter for action recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1\u20138).","DOI":"10.1109\/CVPR.2008.4587727"},{"key":"1018_CR45","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Qiu, W., Friedrich, A., Pinkal, M., & Schiele, B. (2014). Coherent multi-sentence video description with variable level of detail. In German conference on pattern recognition (pp. 184\u2013195).","DOI":"10.1007\/978-3-319-11752-2_15"},{"key":"1018_CR46","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., & Schiele, B. (2015). A dataset for movie description. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3202\u20133212).","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"1018_CR47","doi-asserted-by":"crossref","unstructured":"Rohrbach, M., Amin, S., Andriluka, M., & Schiele, B. (2012). A database for fine grained activity detection of cooking activities. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1194\u20131201).","DOI":"10.1109\/CVPR.2012.6247801"},{"key":"1018_CR48","doi-asserted-by":"crossref","unstructured":"Rubinstein, M., Joulin, A., Kopf, J., & Liu, C. (2013). Unsupervised joint object discovery and segmentation in internet images. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1939\u20131946).","DOI":"10.1109\/CVPR.2013.253"},{"issue":"3","key":"1018_CR49","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., et al. (2015). Image net large scale visual recognition challenge. International Journal of Computer Vision, 115(3), 211\u2013252.","journal-title":"International Journal of Computer Vision"},{"key":"1018_CR50","doi-asserted-by":"crossref","unstructured":"Schulter, S., Leistner, C., Roth, P. M., & Bischof, H. (2013). Unsupervised object discovery and segmentation in videos. In Proceedings of the british machine vision conference (pp. 53.1\u201353.12).","DOI":"10.5244\/C.27.53"},{"key":"1018_CR51","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G. A., Varol, G., Wang, X., Farhadi, A., Laptev, I., & Gupta, A. (2016). Hollywood in homes: Crowdsourcing data collection for activity understanding. In Proceedings of the european conference on computer vision (pp. 510\u2013526).","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"1018_CR52","unstructured":"Simonyan, K., & Zisserman, A. (2015). Very deep convolutional networks for large-scale image recognition. In International conference on learning representations"},{"key":"1018_CR53","unstructured":"Socher, R., Bauer, J., Manning, C. D., & Ng, A. Y. (2013). Parsing with compositional vector grammars. In Proceedings of the annual meeting of the association for computational linguistics (pp. 455\u2013465)."},{"key":"1018_CR54","doi-asserted-by":"crossref","unstructured":"Srikantha, A., & Gall, J. (2014). Discovering object classes from activities. In Proceedings of the european conference on computer vision (pp. 415\u2013430).","DOI":"10.1007\/978-3-319-10599-4_27"},{"issue":"C","key":"1018_CR55","doi-asserted-by":"publisher","first-page":"138","DOI":"10.1016\/j.cviu.2016.09.006","volume":"156","author":"A Srikantha","year":"2017","unstructured":"Srikantha, A., & Gall, J. (2017). Weak supervision for detecting object classes from activities. Computer Vision and Image Understanding., 156(C), 138\u2013150.","journal-title":"Computer Vision and Image Understanding."},{"key":"1018_CR56","doi-asserted-by":"crossref","unstructured":"Tang, K., Joulin, A., Li, J., & Fei-Fei, L. (2014). Co-localization in real-world images. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1464\u20131471).","DOI":"10.1109\/CVPR.2014.190"},{"key":"1018_CR57","unstructured":"Torabi, A., Chris, P., Hugo, L., & Aaron, C. (2015). Using descriptive video services to create a large data source for video annotation research. CoRR 1503.01070"},{"issue":"2","key":"1018_CR58","doi-asserted-by":"publisher","first-page":"284","DOI":"10.1007\/s11263-009-0271-8","volume":"88","author":"T Tuytelaars","year":"2010","unstructured":"Tuytelaars, T., Lampert, C. H., Blaschko, M. B., & Buntine, W. L. (2010). Unsupervised object discovery: A comparison. International Journal of Computer Vision, 88(2), 284\u2013302.","journal-title":"International Journal of Computer Vision"},{"key":"1018_CR59","unstructured":"Venugopalan, S., Xu, H., Donahue, J., Rohrbach, M., Mooney, R. J., & Saenko, K. (2015). Translating videos to natural language using deep recurrent neural networks. In The conference of the North American chapter of the association for computational linguistics: Human language technologies (pp. 1494\u20131504)."},{"key":"1018_CR60","doi-asserted-by":"crossref","unstructured":"Wang, L., Hua, G., Sukthankar, R., Xue, J., & Zheng, N. (2014). Video object discovery and co-segmentation with extremely weak supervision. In Proceedings of the european conference on computer vision (pp. 640\u2013655).","DOI":"10.1007\/978-3-319-10593-2_42"},{"key":"1018_CR61","unstructured":"Wong, Y. W., & Mooney, R. J. (2007). Learning synchronous grammars for semantic parsing with lambda calculus. In Proceedings of the annual meeting of the association for computational linguistics (pp. 960\u2013967)."},{"key":"1018_CR62","doi-asserted-by":"crossref","unstructured":"Xiao, F., & Lee, Y. J. (2016). Track and segment: An iterative unsupervised approach for video object proposals. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 933\u2013942).","DOI":"10.1109\/CVPR.2016.107"},{"key":"1018_CR63","doi-asserted-by":"crossref","first-page":"601","DOI":"10.1613\/jair.4556","volume":"52","author":"H Yu","year":"2015","unstructured":"Yu, H., Siddharth, N., Barbu, A., & Siskind, J. M. (2015). A compositional framework for grounding language inference, generation, and acquisition in video. Journal of Artificial Intelligence Research, 52, 601\u2013713.","journal-title":"Journal of Artificial Intelligence Research"},{"key":"1018_CR64","doi-asserted-by":"crossref","unstructured":"Yu, H., Wang, J., Huang, Z., Yang, Y., & Xu, W. (2016). Video paragraph captioning using hierarchical recurrent neural networks. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4584\u20134593).","DOI":"10.1109\/CVPR.2016.496"},{"key":"1018_CR65","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Kiros, R., Zemel, R., Salakhutdinov, R., Urtasun, R., Torralba, A., & Fidler, S. (2015). Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. In Proceedings of the IEEE international conference on computer vision (pp. 19\u201327).","DOI":"10.1109\/ICCV.2015.11"},{"key":"1018_CR66","doi-asserted-by":"crossref","unstructured":"Zitnick, C. L., & Doll\u00e1r, P. (2014). Edge boxes: Locating object proposals from edges. In Proceedings of the european conference on computer vision (pp. 391\u2013405).","DOI":"10.1007\/978-3-319-10602-1_26"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-017-1018-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-017-1018-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-017-1018-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,5,17]],"date-time":"2020-05-17T07:16:05Z","timestamp":1589699765000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-017-1018-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,6,9]]},"references-count":66,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2017,9]]}},"alternative-id":["1018"],"URL":"https:\/\/doi.org\/10.1007\/s11263-017-1018-6","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2017,6,9]]},"assertion":[{"value":"2 July 2016","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 May 2017","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 June 2017","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}