{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T05:25:08Z","timestamp":1755926708388,"version":"3.40.3"},"publisher-location":"Cham","reference-count":69,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031198052"},{"type":"electronic","value":"9783031198069"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19806-9_12","type":"book-chapter","created":{"date-parts":[[2022,10,19]],"date-time":"2022-10-19T23:11:54Z","timestamp":1666221114000},"page":"201-219","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Improving Closed and\u00a0Open-Vocabulary Attribute Prediction Using 
Transformers"],"prefix":"10.1007","author":[{"given":"Khoi","family":"Pham","sequence":"first","affiliation":[]},{"given":"Kushal","family":"Kafle","sequence":"additional","affiliation":[]},{"given":"Zhe","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Zhihong","family":"Ding","sequence":"additional","affiliation":[]},{"given":"Scott","family":"Cohen","sequence":"additional","affiliation":[]},{"given":"Quan","family":"Tran","sequence":"additional","affiliation":[]},{"given":"Abhinav","family":"Shrivastava","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,20]]},"reference":[{"key":"12_CR1","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: The IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"12_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"663","DOI":"10.1007\/978-3-642-15549-9_48","volume-title":"Computer Vision \u2013 ECCV 2010","author":"TL Berg","year":"2010","unstructured":"Berg, T.L., Berg, A.C., Shih, J.: Automatic attribute discovery and characterization from noisy web data. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010. LNCS, vol. 6311, pp. 663\u2013676. Springer, Heidelberg (2010). https:\/\/doi.org\/10.1007\/978-3-642-15549-9_48"},{"key":"12_CR4","doi-asserted-by":"publisher","first-page":"409","DOI":"10.1613\/jair.4900","volume":"55","author":"R Bernardi","year":"2016","unstructured":"Bernardi, R., et al.: Automatic description generation from images: a survey of models, datasets, and evaluation measures. 
J. Artif. Intell. Res. 55, 409\u2013442 (2016)","journal-title":"J. Artif. Intell. Res."},{"key":"12_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Chao, Y.W., Wang, Z., He, Y., Wang, J., Deng, J.: HICO: a benchmark for recognizing human-object interactions in images. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1017\u20131025 (2015)","DOI":"10.1109\/ICCV.2015.122"},{"key":"12_CR7","unstructured":"Chen, X., Fang, H., Lin, T.Y., Vedantam, R., Gupta, S., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"12_CR8","doi-asserted-by":"crossref","unstructured":"Chen, Y.C., et al.: UNITER: learning universal image-text representations (2019)","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"12_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Z.M., Wei, X.S., Wang, P., Guo, Y.: Multi-label image recognition with graph convolutional networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5177\u20135186 (2019)","DOI":"10.1109\/CVPR.2019.00532"},{"key":"12_CR10","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. 
arXiv preprint arXiv:1810.04805 (2018)"},{"key":"12_CR11","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Endres, I., Hoiem, D., Forsyth, D.: Describing objects by their attributes. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 1778\u20131785. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206772"},{"key":"12_CR12","unstructured":"Ferrari, V., Zisserman, A.: Learning visual attributes. In: Advances in neural information processing systems, pp. 433\u2013440 (2008)"},{"key":"12_CR13","doi-asserted-by":"crossref","unstructured":"Gao, M., Xing, C., Niebles, J.C., Li, J., Xu, R., Liu, W., Xiong, C.: Towards open vocabulary object detection without human-provided bounding boxes. arXiv preprint arXiv:2111.09452 (2021)","DOI":"10.1007\/978-3-031-20080-9_16"},{"key":"12_CR14","unstructured":"Gu, X., Lin, T.Y., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)"},{"key":"12_CR15","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., et al.: Open-vocabulary object retrieval. In: Robotics: science and systems, vol. 2, p. 6 (2014)","DOI":"10.15607\/RSS.2014.X.041"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Guo, S., et al.: The iMaterialist fashion attribute dataset. In: Proceedings of the IEEE International Conference on Computer Vision Workshops (2019)","DOI":"10.1109\/ICCVW.2019.00377"},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Gupta, T., Schwing, A., Hoiem, D.: VICO: word embeddings from visual co-occurrences. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7425\u20137434 (2019)","DOI":"10.1109\/ICCV.2019.00752"},{"key":"12_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 
770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"12_CR19","doi-asserted-by":"publisher","unstructured":"Honnibal, M., Montani, I., Van Landeghem, S., Boyd, A.: SpaCy: industrial-strength Natural Language Processing in Python (2020). https:\/\/doi.org\/10.5281\/zenodo.1212303","DOI":"10.5281\/zenodo.1212303"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Huang, H., et al.: Unicoder: a universal language encoder by pre-training with multiple cross-lingual tasks. arXiv preprint arXiv:1909.00964 (2019)","DOI":"10.18653\/v1\/D19-1252"},{"key":"12_CR21","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"12_CR22","doi-asserted-by":"crossref","unstructured":"Huynh, D., Kuen, J., Lin, Z., Gu, J., Elhamifar, E.: Open-vocabulary instance segmentation via robust cross-modal pseudo-labeling. arXiv preprint arXiv:2111.12698 (2021)","DOI":"10.1109\/CVPR52688.2022.00689"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Jiang, H., Misra, I., Rohrbach, M., Learned-Miller, E., Chen, X.: In defense of grid features for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10267\u201310276 (2020)","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"12_CR24","unstructured":"Jin, Y., et al.: Decoupling object detection from human-object interaction recognition. arXiv preprint arXiv:2112.06392 (2021)"},{"key":"12_CR25","doi-asserted-by":"crossref","unstructured":"Johnson, J., et al.: Image retrieval using scene graphs. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 
3668\u20133678 (2015)","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"12_CR26","doi-asserted-by":"crossref","unstructured":"Kafle, K., Kanan, C.: Visual question answering: Datasets, algorithms, and future challenges. Computer Vision and Image Understanding (2017)","DOI":"10.1109\/ICCV.2017.217"},{"key":"12_CR27","doi-asserted-by":"publisher","first-page":"28","DOI":"10.3389\/frai.2019.00028","volume":"2","author":"K Kafle","year":"2019","unstructured":"Kafle, K., Shrestha, R., Kanan, C.: Challenges and prospects in vision and language research. Front. Artif. Intell. 2, 28 (2019)","journal-title":"Front. Artif. Intell."},{"key":"12_CR28","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., Carion, N.: MDETR-modulated detection for end-to-end multi-modal understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1780\u20131790 (2021)","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"12_CR29","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: ReferitGame: Referring to objects in photographs of natural scenes. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 787\u2013798 (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"12_CR30","doi-asserted-by":"crossref","unstructured":"Kosti, R., Alvarez, J.M., Recasens, A., Lapedriza, A.: Emotion recognition in context. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1667\u20131675 (2017)","DOI":"10.1109\/CVPR.2017.212"},{"issue":"1","key":"12_CR31","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017)","journal-title":"Int. J. Comput. 
Vis."},{"key":"12_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: OSCAR: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"12_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"684","DOI":"10.1007\/978-3-319-46466-4_41","volume-title":"Computer Vision \u2013 ECCV 2016","author":"Y Li","year":"2016","unstructured":"Li, Y., Huang, C., Loy, C.C., Tang, X.: Human attribute recognition by deep hierarchical contexts. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 684\u2013700. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_41"},{"key":"12_CR34","unstructured":"Li, Y.L., et al.: HAKE: Human activity knowledge engine. arXiv preprint arXiv:1904.06539 (2019)"},{"key":"12_CR35","doi-asserted-by":"crossref","unstructured":"Li, Y.L., et al.: PaStaNet: toward human activity knowledge engine. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 382\u2013391 (2020)","DOI":"10.1109\/CVPR42600.2020.00046"},{"key":"12_CR36","doi-asserted-by":"crossref","unstructured":"Liu, Z., Luo, P., Qiu, S., Wang, X., Tang, X.: DeepFashion: powering robust clothes recognition and retrieval with rich annotations. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1096\u20131104 (2016)","DOI":"10.1109\/CVPR.2016.124"},{"key":"12_CR37","doi-asserted-by":"crossref","unstructured":"Liu, Z., Luo, P., Wang, X., Tang, X.: Deep learning face attributes in the wild. 
In: Proceedings of International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.425"},{"key":"12_CR38","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"852","DOI":"10.1007\/978-3-319-46448-0_51","volume-title":"Computer Vision \u2013 ECCV 2016","author":"C Lu","year":"2016","unstructured":"Lu, C., Krishna, R., Bernstein, M., Fei-Fei, L.: Visual relationship detection with language priors. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 852\u2013869. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_51"},{"key":"12_CR39","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: VILBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. arXiv preprint arXiv:1908.02265 (2019)"},{"key":"12_CR40","doi-asserted-by":"crossref","unstructured":"Misra, I., Gupta, A., Hebert, M.: From red wine to red tomato: composition with context. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1792\u20131801 (2017)","DOI":"10.1109\/CVPR.2017.129"},{"key":"12_CR41","doi-asserted-by":"crossref","unstructured":"Naeem, M.F., Xian, Y., Tombari, F., Akata, Z.: Learning graph embeddings for compositional zero-shot learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 953\u2013962 (2021)","DOI":"10.1109\/CVPR46437.2021.00101"},{"key":"12_CR42","doi-asserted-by":"crossref","unstructured":"Nagarajan, T., Grauman, K.: Attributes as operators: factorizing unseen attribute-object compositions. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 169\u2013185 (2018)","DOI":"10.1007\/978-3-030-01246-5_11"},{"key":"12_CR43","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.L.: Im2Text: Describing images using 1 million captioned photographs. 
In: Neural Information Processing Systems (NIPS) (2011)"},{"key":"12_CR44","doi-asserted-by":"crossref","unstructured":"Parikh, D., Grauman, K.: Relative attributes. In: 2011 International Conference on Computer Vision, pp. 503\u2013510. IEEE (2011)","DOI":"10.1109\/ICCV.2011.6126281"},{"key":"12_CR45","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"85","DOI":"10.1007\/978-3-319-46466-4_6","volume-title":"Computer Vision \u2013 ECCV 2016","author":"G Patterson","year":"2016","unstructured":"Patterson, G., Hays, J.: COCO attributes: attributes for people, animals, and objects. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 85\u2013100. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_6"},{"key":"12_CR46","doi-asserted-by":"crossref","unstructured":"Pham, K., et al.: Learning to predict visual attributes in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13018\u201313028 (2021)","DOI":"10.1109\/CVPR46437.2021.01282"},{"key":"12_CR47","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"12_CR48","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"647","DOI":"10.1007\/978-3-030-58558-7_38","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Pont-Tuset","year":"2020","unstructured":"Pont-Tuset, J., Uijlings, J., Changpinyo, S., Soricut, R., Ferrari, V.: Connecting vision and language with localized narratives. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12350, pp. 647\u2013664. Springer, Cham (2020). 
https:\/\/doi.org\/10.1007\/978-3-030-58558-7_38"},{"key":"12_CR49","doi-asserted-by":"crossref","unstructured":"Purushwalkam, S., Nickel, M., Gupta, A., Ranzato, M.: Task-driven modular networks for zero-shot compositional learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3593\u20133602 (2019)","DOI":"10.1109\/ICCV.2019.00369"},{"key":"12_CR50","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. arXiv preprint arXiv:2103.00020 (2021)"},{"key":"12_CR51","doi-asserted-by":"crossref","unstructured":"Saini, N., Pham, K., Shrivastava, A.: Disentangling visual embeddings for attributes and objects. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13658\u201313667 (2022)","DOI":"10.1109\/CVPR52688.2022.01329"},{"key":"12_CR52","doi-asserted-by":"crossref","unstructured":"Sarafianos, N., Xu, X., Kakadiaris, I.A.: Deep imbalanced attribute classification using visual attention aggregation. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 680\u2013697 (2018)","DOI":"10.1007\/978-3-030-01252-6_42"},{"key":"12_CR53","doi-asserted-by":"crossref","unstructured":"Schuster, S., Krishna, R., Chang, A., Fei-Fei, L., Manning, C.D.: Generating semantically precise scene graphs from textual descriptions for improved image retrieval. In: Workshop on Vision and Language (VL15). Association for Computational Linguistics, Lisbon, Portugal (2015)","DOI":"10.18653\/v1\/W15-2812"},{"key":"12_CR54","doi-asserted-by":"publisher","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565. Association for Computational Linguistics, Melbourne, Australia (2018). 
https:\/\/doi.org\/10.18653\/v1\/P18-1238,http:\/\/aclanthology.org\/P18-1238","DOI":"10.18653\/v1\/P18-1238"},{"key":"12_CR55","doi-asserted-by":"crossref","unstructured":"Siddiquie, B., Feris, R.S., Davis, L.S.: Image ranking and retrieval based on multi-attribute queries. In: CVPR 2011, pp. 801\u2013808. IEEE (2011)","DOI":"10.1109\/CVPR.2011.5995329"},{"key":"12_CR56","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"12_CR57","doi-asserted-by":"crossref","unstructured":"Wang, S., Thompson, L., Iyyer, M.: Phrase-BERT: improved phrase embeddings from bert with an application to corpus exploration. arXiv preprint arXiv:2109.06304 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.846"},{"key":"12_CR58","unstructured":"Wolf, T., et al.: Transformers: state-of-the-art natural language processing. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pp. 38\u201345 (2020)"},{"key":"12_CR59","doi-asserted-by":"crossref","unstructured":"Wu, C., Lin, Z., Cohen, S., Bui, T., Maji, S.: Phrasecut: language-based image segmentation in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10216\u201310225 (2020)","DOI":"10.1109\/CVPR42600.2020.01023"},{"key":"12_CR60","doi-asserted-by":"crossref","unstructured":"Wu, H., et al.: Unified visual-semantic embeddings: bridging vision and language with structured meaning representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6609\u20136618 (2019)","DOI":"10.1109\/CVPR.2019.00677"},{"key":"12_CR61","unstructured":"Wu, Y., et al.: Google\u2019s neural machine translation system: bridging the gap between human and machine translation. 
arXiv preprint arXiv:1609.08144 (2016)"},{"issue":"9","key":"12_CR62","doi-asserted-by":"publisher","first-page":"2251","DOI":"10.1109\/TPAMI.2018.2857768","volume":"41","author":"Y Xian","year":"2018","unstructured":"Xian, Y., Lampert, C.H., Schiele, B., Akata, Z.: Zero-shot learning-a comprehensive evaluation of the good, the bad and the ugly. IEEE Trans. Pattern Anal. Mach. Intell. 41(9), 2251\u20132265 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"12_CR63","doi-asserted-by":"crossref","unstructured":"Xu, D., Zhu, Y., Choy, C.B., Fei-Fei, L.: Scene graph generation by iterative message passing. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 5410\u20135419 (2017)","DOI":"10.1109\/CVPR.2017.330"},{"key":"12_CR64","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"606","DOI":"10.1007\/978-3-030-58592-1_36","volume-title":"Computer Vision \u2013 ECCV 2020","author":"A Zareian","year":"2020","unstructured":"Zareian, A., Karaman, S., Chang, S.-F.: Bridging knowledge graphs to generate scene graphs. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12368, pp. 606\u2013623. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58592-1_36"},{"key":"12_CR65","doi-asserted-by":"crossref","unstructured":"Zareian, A., Rosa, K.D., Hu, D.H., Chang, S.F.: Open-vocabulary object detection using captions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14393\u201314402 (2021)","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"12_CR66","doi-asserted-by":"crossref","unstructured":"Zhang, H., Kyaw, Z., Chang, S.F., Chua, T.S.: Visual translation embedding network for visual relation detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 
5532\u20135540 (2017)","DOI":"10.1109\/CVPR.2017.331"},{"key":"12_CR67","doi-asserted-by":"crossref","unstructured":"Zhang, P., et al.: VinVL: Making visual representations matter in vision-language models. arXiv preprint arXiv:2101.00529 (2021)","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"12_CR68","doi-asserted-by":"crossref","unstructured":"Zhao, H., Puig, X., Zhou, B., Fidler, S., Torralba, A.: Open vocabulary scene parsing. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2002\u20132010 (2017)","DOI":"10.1109\/ICCV.2017.221"},{"key":"12_CR69","doi-asserted-by":"crossref","unstructured":"Zhou, B., Lapedriza, A., Khosla, A., Oliva, A., Torralba, A.: Places: a 10 million image database for scene recognition. In: IEEE Transactions on Pattern Analysis and Machine Intelligence (2017)","DOI":"10.1167\/17.10.296"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19806-9_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T04:32:23Z","timestamp":1728189143000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19806-9_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198052","9783031198069"],"references-count":69,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19806-9_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"20 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference 
Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference 
organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}