{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T20:52:11Z","timestamp":1776286331565,"version":"3.50.1"},"reference-count":59,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2020,3,13]],"date-time":"2020-03-13T00:00:00Z","timestamp":1584057600000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,3,13]],"date-time":"2020-03-13T00:00:00Z","timestamp":1584057600000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2020,7]]},"DOI":"10.1007\/s11263-020-01316-z","type":"journal-article","created":{"date-parts":[[2020,3,13]],"date-time":"2020-03-13T16:03:52Z","timestamp":1584115432000},"page":"1956-1981","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1561,"title":["The Open Images Dataset V4"],"prefix":"10.1007","volume":"128","author":[{"given":"Alina","family":"Kuznetsova","sequence":"first","affiliation":[]},{"given":"Hassan","family":"Rom","sequence":"additional","affiliation":[]},{"given":"Neil","family":"Alldrin","sequence":"additional","affiliation":[]},{"given":"Jasper","family":"Uijlings","sequence":"additional","affiliation":[]},{"given":"Ivan","family":"Krasin","sequence":"additional","affiliation":[]},{"given":"Jordi","family":"Pont-Tuset","sequence":"additional","affiliation":[]},{"given":"Shahab","family":"Kamali","sequence":"additional","affiliation":[]},{"given":"Stefan","family":"Popov","sequence":"additional","affiliation":[]},{"given":"Matteo","family":"Malloci","sequence":"additional","affiliation":[]},{"given":"Alexander","family":"Kolesnikov","sequence":"additional","affiliation":[]},{"given":"Tom","family":"Duerig","sequence":"additional","affiliation":[]},{"given":"Vittorio","family":"Ferrari","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,3,13]]},"reference":[{"key":"1316_CR1","doi-asserted-by":"crossref","unstructured":"Alexe, B., Deselaers, T., & Ferrari, V. (2010). What is an object? In CVPR.","DOI":"10.1109\/CVPR.2010.5540226"},{"key":"1316_CR2","doi-asserted-by":"publisher","first-page":"2189","DOI":"10.1109\/TPAMI.2012.28","volume":"34","author":"B Alexe","year":"2012","unstructured":"Alexe, B., Deselaers, T., & Ferrari, V. (2012). Measuring the objectness of image windows. IEEE Transactions on PAMI, 34, 2189\u20132202.","journal-title":"IEEE Transactions on PAMI"},{"key":"1316_CR3","doi-asserted-by":"crossref","unstructured":"Chollet, F. (2017). Xception: Deep learning with depthwise separable convolutions. In CVPR.","DOI":"10.1109\/CVPR.2017.195"},{"key":"1316_CR4","doi-asserted-by":"crossref","unstructured":"Dai, B., Zhang, Y., & Lin, D. (2017). Detecting visual relationships with deep relational networks. In CVPR.","DOI":"10.1109\/CVPR.2017.352"},{"key":"1316_CR5","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., & Fei-fei, L. (2009). ImageNet: A large-scale hierarchical image database. In CVPR.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1316_CR6","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C. K. I., Winn, J., & Zisserman, A. (2010). The PASCAL Visual Object Classes (VOC) Challenge. International Journal of Computer Vision, 88, 303\u2013338.","journal-title":"International Journal of Computer Vision"},{"key":"1316_CR7","unstructured":"Everingham, M., Van\u00a0Gool, L., Williams, C.K.I., Winn, J., & Zisserman, A. (2012). The PASCAL visual object classes challenge 2012 (VOC2012) results. http:\/\/www.pascal-network.org\/challenges\/VOC\/voc2012\/workshop\/index.html."},{"key":"1316_CR8","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1007\/s11263-014-0733-5","volume":"111","author":"M Everingham","year":"2015","unstructured":"Everingham, M., Eslami, S., van Gool, L., Williams, C., Winn, J., & Zisserman, A. (2015). The PASCAL visual object classes challenge: A retrospective. International Journal of Computer Vision, 111, 98\u2013136.","journal-title":"International Journal of Computer Vision"},{"issue":"4","key":"1316_CR9","doi-asserted-by":"publisher","first-page":"594","DOI":"10.1109\/TPAMI.2006.79","volume":"28","author":"L Fei-Fei","year":"2006","unstructured":"Fei-Fei, L., Fergus, R., & Perona, P. (2006). One-shot learning of object categories. IEEE Transactions on Pattern Analysis and Machine Intelligence, 28(4), 594\u2013611.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1316_CR10","doi-asserted-by":"crossref","unstructured":"Felzenszwalb, P., Girshick, R., & McAllester, D. (2010a). Cascade object detection with deformable part models. In CVPR.","DOI":"10.1109\/CVPR.2010.5539906"},{"issue":"9","key":"1316_CR11","doi-asserted-by":"publisher","first-page":"1627","DOI":"10.1109\/TPAMI.2009.167","volume":"32","author":"P Felzenszwalb","year":"2010","unstructured":"Felzenszwalb, P., Girshick, R., McAllester, D., & Ramanan, D. (2010b). Object detection with discriminatively trained part based models. IEEE Transactions on Pattern Analysis and Machine Intelligence, 32(9), 1627\u20131645.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1316_CR12","unstructured":"Gao, C., Zou, Y., & Huang, J.B. (2018). iCAN: Instance-centric attention network for human-object interaction detection. In BMVC."},{"key":"1316_CR13","doi-asserted-by":"crossref","unstructured":"Girshick, R. (2015). Fast R-CNN. In ICCV.","DOI":"10.1109\/ICCV.2015.169"},{"key":"1316_CR14","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., & Malik, J. (2014). Rich feature hierarchies for accurate object detection and semantic segmentation. In CVPR.","DOI":"10.1109\/CVPR.2014.81"},{"key":"1316_CR15","doi-asserted-by":"crossref","unstructured":"Gkioxari, G., Girshick, R., Doll\u00e1r, P., & He, K. (2018). Detecting and recognizing human-object interactions. CVPR.","DOI":"10.1109\/CVPR.2018.00872"},{"key":"1316_CR16","unstructured":"Griffin, G., Holub, A., & Perona, P. (2007). The Caltech-256. Technical report, Caltech."},{"key":"1316_CR17","doi-asserted-by":"publisher","first-page":"1775","DOI":"10.1109\/TPAMI.2009.83","volume":"31","author":"A Gupta","year":"2009","unstructured":"Gupta, A., Kembhavi, A., & Davis, L. (2009). Observing human-object interactions: Using spatial and functional compatibility for recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence, 31, 1775\u20131789.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1316_CR18","unstructured":"Gupta, S., & Malik, J. (2015). Visual semantic role labeling. arXiv preprint arXiv:1505.04474."},{"key":"1316_CR19","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.90"},{"key":"1316_CR20","unstructured":"Hinton, G. E., Vinyals, O., & Dean, J. (2014). Distilling the knowledge in a neural network. In NeurIPS."},{"key":"1316_CR21","doi-asserted-by":"crossref","unstructured":"Huang, J., Rathod, V., Sun, C., Zhu, M., Korattikara, A., Fathi, A., Fischer, I., Wojna, Z., Song, Y., Guadarrama, S., & Murphy, K. (2017). Speed\/accuracy trade-offs for modern convolutional object detectors. In CVPR.","DOI":"10.1109\/CVPR.2017.351"},{"key":"1316_CR22","unstructured":"Ioffe, S., & Szegedy, C. (2015). Batch normalization: Accelerating deep network training by reducing internal covariate shift. In ICML."},{"key":"1316_CR23","unstructured":"Kolesnikov, A., Kuznetsova, A., Lampert, C., & Ferrari, V. (2018). Detecting visual relationships using box attention. arXiv:1807.02136."},{"issue":"1","key":"1316_CR24","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., et al. (2017). Visual genome: Connecting language and vision using crowdsourced dense image annotations. IJCV, 123(1), 32\u201373.","journal-title":"IJCV"},{"key":"1316_CR25","unstructured":"Krizhevsky, A. (2009). Learning multiple layers of features from tiny images. Technical report, University of Toronto."},{"key":"1316_CR26","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G.E. (2012). Imagenet classification with deep convolutional neural networks. In NeurIPS."},{"key":"1316_CR27","doi-asserted-by":"crossref","unstructured":"Li, Y., Ouyang, W., Wang, X., & Tang, X. (2017). ViP-CNN: Visual phrase guided convolutional neural network. In CVPR.","DOI":"10.1109\/CVPR.2017.766"},{"key":"1316_CR28","doi-asserted-by":"crossref","unstructured":"Liang, K., Guo, Y., Chang, H., & Chen, X. (2018). Visual relationship detection with deep structural ranking. In AAAI.","DOI":"10.1609\/aaai.v32i1.12274"},{"key":"1316_CR29","doi-asserted-by":"crossref","unstructured":"Liang, X., Lee, L., & Xing, E. P. (2017). Deep variation-structured reinforcement learning for visual relationship and attribute detection. In CVPR.","DOI":"10.1109\/CVPR.2017.469"},{"key":"1316_CR30","doi-asserted-by":"crossref","unstructured":"Lin, T., Goyal, P., Girshick, R., He, K., & Dollar, P. (2017). Focal loss for dense object detection. In ICCV.","DOI":"10.1109\/ICCV.2017.324"},{"key":"1316_CR31","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., Bourdev, L., Girshick, R., Hays, J., Perona P, Ramanan, D., Zitnick, C.L., & Doll\u00e1r, P. (2014). Microsoft COCO: Common objects in context. In ECCV.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1316_CR32","doi-asserted-by":"crossref","unstructured":"Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C.Y., & Berg, A.C. (2016). SSD: Single shot multibox detector. In ECCV.","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"1316_CR33","doi-asserted-by":"crossref","unstructured":"Lu, C., Krishna, R., Bernstein, M., & Fei-Fei, L. (2016). Visual relationship detection with language priors. In European Conference on Computer Vision.","DOI":"10.1007\/978-3-319-46448-0_51"},{"key":"1316_CR34","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., & Dean, J. (2013). Distributed representations of words and phrases and their compositionality. In NeurIPS."},{"key":"1316_CR35","doi-asserted-by":"crossref","unstructured":"Papadopoulos, D.P., Uijlings, J.R.R., Keller, F., & Ferrari, V. (2016). We don\u2019t need no bounding-boxes: Training object class detectors using only human verification. In CVPR.","DOI":"10.1109\/CVPR.2016.99"},{"key":"1316_CR36","doi-asserted-by":"crossref","unstructured":"Papadopoulos, D.P., Uijlings, J.R., Keller, F., & Ferrari, V. (2017). Extreme clicking for efficient object annotation. In ICCV.","DOI":"10.1109\/ICCV.2017.528"},{"key":"1316_CR37","doi-asserted-by":"crossref","unstructured":"Peyre, J., Laptev, I., Schmid, C., & Sivic, J. (2017). Weakly-supervised learning of visual relations. In CVPR.","DOI":"10.1109\/ICCV.2017.554"},{"key":"1316_CR38","doi-asserted-by":"publisher","first-page":"601","DOI":"10.1109\/TPAMI.2011.158","volume":"34","author":"A Prest","year":"2012","unstructured":"Prest, A., Schmid, C., & Ferrari, V. (2012). Weakly supervised learning of interactions between humans and objects. IEEE Transactions on Pattern Analysis and Machine Intelligence, 34, 601\u2013614.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"1","key":"1316_CR39","doi-asserted-by":"publisher","first-page":"145","DOI":"10.1016\/S0893-6080(98)00116-6","volume":"12","author":"N Qian","year":"1999","unstructured":"Qian, N. (1999). On the momentum term in gradient descent learning algorithms. Neural Networks, 12(1), 145\u2013151.","journal-title":"Neural Networks"},{"key":"1316_CR40","doi-asserted-by":"crossref","unstructured":"Redmon, J., & Farhadi, A. (2017). YOLO9000: better, faster, stronger. In CVPR.","DOI":"10.1109\/CVPR.2017.690"},{"key":"1316_CR41","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., & Farhadi, A. (2016). You only look once: Unified, real-time object detection. In CVPR.","DOI":"10.1109\/CVPR.2016.91"},{"key":"1316_CR42","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster R-CNN: Towards real-time object detection with region proposal networks. In NeurIPS."},{"key":"1316_CR43","doi-asserted-by":"crossref","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., Berg, A., & Fei-Fei, L. (2015). ImageNet large scale visual recognition challenge. IJCV.","DOI":"10.1007\/s11263-015-0816-y"},{"key":"1316_CR44","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A.G., Zhu, M., Zhmoginov, A., & Chen, L. (2018). Mobilenetv2: Inverted residuals and linear bottleneck. In CVPR.","DOI":"10.1109\/CVPR.2018.00474"},{"key":"1316_CR45","unstructured":"Su, H., Deng, J., & Fei-Fei, L. (2012). Crowdsourcing annotations for visual object detection. In AAAI Human Computation Workshop."},{"key":"1316_CR46","doi-asserted-by":"crossref","unstructured":"Sun, C., Shrivastava, A., Singh, S., & Gupta, A. (2017). Revisiting unreasonable effectiveness of data in deep learning era. In ICCV.","DOI":"10.1109\/ICCV.2017.97"},{"key":"1316_CR47","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., & Rabinovich, A. (2015). Going deeper with convolutions. In CVPR.","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"1316_CR48","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., & Wojna, Z. (2016). Rethinking the inception architecture for computer vision. In CVPR.","DOI":"10.1109\/CVPR.2016.308"},{"key":"1316_CR49","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Ioffe, S., Vanhoucke, V., & Alemi, A. (2017). Inception-v4, inception-resnet and the impact of residual connections on learning. In AAAI.","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"1316_CR50","doi-asserted-by":"crossref","unstructured":"Uijlings, J., Popov, S., & Ferrari, V. (2018). Revisiting knowledge transfer for training object class detectors. In CVPR.","DOI":"10.1109\/CVPR.2018.00121"},{"key":"1316_CR51","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1007\/s11263-013-0620-5","volume":"104","author":"JRR Uijlings","year":"2013","unstructured":"Uijlings, J. R. R., van de Sande, K. E. A., Gevers, T., & Smeulders, A. W. M. (2013). Selective search for object recognition. International Journal of Computer Vision, 104, 154\u2013171.","journal-title":"International Journal of Computer Vision"},{"key":"1316_CR52","unstructured":"Veit, A., Alldrin, N., Chechik, G., Krasin, I., Gupta, A., & Belongie, S. (2017). Learning from noisy large-scale datasets with minimal supervision. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 839\u2013847). http:\/\/openaccess.thecvf.com\/content_cvpr_2017\/papers\/Veit_Learning_From_Noisy_CVPR_2017_paper.pdf."},{"key":"1316_CR53","unstructured":"Viola, P., & Jones, M. (2001a). Rapid object detection using a boosted cascade of simple features. In CVPR."},{"key":"1316_CR54","first-page":"4","volume":"4","author":"P Viola","year":"2001","unstructured":"Viola, P., & Jones, M. (2001b). Robust real-time object detection. International Journal of Computer Vision, 4, 4.","journal-title":"International Journal of Computer Vision"},{"key":"1316_CR55","doi-asserted-by":"crossref","unstructured":"Xu, D., Zhu, Y., Choy, C., & Fei-Fei, L. (2017). Scene graph generation by iterative message passing. In Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2017.330"},{"key":"1316_CR56","doi-asserted-by":"crossref","unstructured":"Yao, B., & Fei-Fei, L. (2010). Modeling mutual context of object and human pose in human-object interaction activities. In CVPR.","DOI":"10.1109\/CVPR.2010.5540235"},{"key":"1316_CR57","doi-asserted-by":"crossref","unstructured":"Zellers, R., Yatskar, M., Thomson, S., & Choi, Y. (2018). Neural motifs: Scene graph parsing with global context. In CVPR.","DOI":"10.1109\/CVPR.2018.00611"},{"key":"1316_CR58","doi-asserted-by":"crossref","unstructured":"Zhang, H., Kyaw, Z., Chang, S.F., & Chua, T.S. (2017a). Visual translation embedding network for visual relation detection. In CVPR.","DOI":"10.1109\/CVPR.2017.331"},{"key":"1316_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, H., Kyaw, Z., Yu, J., & Chang, S.F. (2017b). PPR-FCN: weakly supervised visual relation detection via parallel pairwise R-FCN. In ICCV","DOI":"10.1109\/ICCV.2017.454"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-020-01316-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-020-01316-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-020-01316-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,18]],"date-time":"2022-10-18T19:46:00Z","timestamp":1666122360000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-020-01316-z"}},"subtitle":["Unified Image Classification, Object Detection, and Visual Relationship Detection at Scale"],"short-title":[],"issued":{"date-parts":[[2020,3,13]]},"references-count":59,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2020,7]]}},"alternative-id":["1316"],"URL":"https:\/\/doi.org\/10.1007\/s11263-020-01316-z","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,3,13]]},"assertion":[{"value":"9 November 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 February 2020","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 March 2020","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}