{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:24:27Z","timestamp":1774599867799,"version":"3.50.1"},"publisher-location":"Cham","reference-count":97,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031200588","type":"print"},{"value":"9783031200595","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-20059-5_29","type":"book-chapter","created":{"date-parts":[[2022,10,28]],"date-time":"2022-10-28T16:02:50Z","timestamp":1666972970000},"page":"502-520","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["FindIt: Generalized Localization with\u00a0Natural Language Queries"],"prefix":"10.1007","author":[{"given":"Weicheng","family":"Kuo","sequence":"first","affiliation":[]},{"given":"Fred","family":"Bertsch","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Li","sequence":"additional","affiliation":[]},{"given":"A. J.","family":"Piergiovanni","sequence":"additional","affiliation":[]},{"given":"Mohammad","family":"Saffar","sequence":"additional","affiliation":[]},{"given":"Anelia","family":"Angelova","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,29]]},"reference":[{"key":"29_CR1","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"29_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"397","DOI":"10.1007\/978-3-030-01246-5_24","volume-title":"Computer Vision \u2013 ECCV 2018","author":"A Bansal","year":"2018","unstructured":"Bansal, A., Sikka, K., Sharma, G., Chellappa, R., Divakaran, A.: Zero-shot object detection. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11205, pp. 397\u2013414. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01246-5_24"},{"key":"29_CR3","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers (2020). https:\/\/arxiv.org\/abs\/2005.12872","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"29_CR4","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Pont-Tuset, J., Ferrari, V., Soricut, R.: Telling the what while pointing to the where: multimodal queries for image retrieval. Arxiv: 2102.04980 (2021)","DOI":"10.1109\/ICCV48922.2021.01192"},{"key":"29_CR5","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12m: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"29_CR6","unstructured":"Chen, X., et al.: Microsoft coco captions: data collection and evaluation server. 
Arxiv: https:\/\/arxiv.org\/abs\/1504.00325 (2015)"},{"key":"29_CR7","unstructured":"Chen, X., Ma, L., Chen, J., Jie, Z., Liu, W., Luo, J.: Real-time referring expression comprehension by single-stage grounding network. arXiv preprint arXiv:1812.03426 (2018)"},{"key":"29_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"YC Chen","year":"2020","unstructured":"Chen, Y.C., et al.: UNITER: UNiversal image-TExt representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"29_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Z., Wang, P., Ma, L., Wong, K.Y.K., Wu, Q.: Cops-ref: a new dataset and task on compositional referring expression comprehension. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01010"},{"key":"29_CR10","unstructured":"Cho, J., Lei, J., Tan, H., Bansal, M.: Unifying vision-and-language tasks via text generation. Arxiv: 2102.02779 (2021)"},{"key":"29_CR11","doi-asserted-by":"crossref","unstructured":"Das, A., et al.: Visual dialog. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.121"},{"key":"29_CR12","doi-asserted-by":"crossref","unstructured":"Deng, J., Yang, Z., Chen, T., Zhou, W., Li, H.: TransVG: end-to-end visual grounding with transformers. ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"29_CR13","doi-asserted-by":"crossref","unstructured":"Desai, K., Johnson, J.: VirTex: learning visual representations from textual annotations. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"29_CR14","doi-asserted-by":"crossref","unstructured":"Dhamija, A.R., Gunther, M., Ventura, J., Boult, T.E.: The overlooked elephant of object detection: open set. In: WACV (2020)","DOI":"10.1109\/WACV45572.2020.9093355"},{"key":"29_CR15","doi-asserted-by":"crossref","unstructured":"Gan, C., Li, Y., Li, H., Sun, C., Gong, B.: VQS: linking segmentations to questions and answers for supervised attention in VQA and question-focused semantic segmentation. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.201"},{"key":"29_CR16","unstructured":"Gan, Z., Chen, Y.C., Li, L., Zhu, C., Cheng, Y., Liu, J.: Large-scale adversarial training for vision-and-language representation learning. In: NeurIPS (2020)"},{"key":"29_CR17","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.81"},{"key":"29_CR18","unstructured":"Girshick, R., Radosavovic, I., Gkioxari, G., Doll\u00e1r, P., He, K.: Detectron (2018). https:\/\/github.com\/facebookresearch\/detectron"},{"key":"29_CR19","unstructured":"Gu, X., Lin, T., Kuo, W., Cui, Y.: Zero-shot detection via vision and language knowledge distillation. CoRR abs\/2104.13921 (2021). https:\/\/arxiv.org\/abs\/2104.13921"},{"key":"29_CR20","unstructured":"Gupta, T., Kamath, A., Kembhavi, A., Hoiem2, D.: Towards general purpose vision systems. 
arxiv.org\/abs\/2104.00743 (2021)"},{"key":"29_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"752","DOI":"10.1007\/978-3-030-58580-8_44","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Gupta","year":"2020","unstructured":"Gupta, T., Vahdat, A., Chechik, G., Yang, X., Kautz, J., Hoiem, D.: Contrastive learning for weakly supervised phrase grounding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 752\u2013768. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_44"},{"key":"29_CR22","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Dollar, P., Girshick, R.: Mask R-CNN. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"29_CR23","doi-asserted-by":"crossref","unstructured":"Hinami, R., Satoh, S.: Discriminative learning of open-vocabulary object retrieval and localization by negative phrase augmentation. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (2018)","DOI":"10.18653\/v1\/D18-1281"},{"key":"29_CR24","unstructured":"Hong, R., Liu, D., Mo, X., He, X., Zhang, H.: Learning to compose and reason with language tree structures for visual grounding. TPAMI (2019)"},{"key":"29_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"108","DOI":"10.1007\/978-3-319-46448-0_7","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Hu","year":"2016","unstructured":"Hu, R., Rohrbach, M., Darrell, T.: Segmentation from natural language expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 108\u2013124. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_7"},{"key":"29_CR26","doi-asserted-by":"crossref","unstructured":"Hu, R., Rohrbach, M., Andreas, J., Darrell, T., Saenko, K.: Modeling relationships in referential expressions with compositional modular networks. In: CVPR, pp. 1115\u20131124 (2017)","DOI":"10.1109\/CVPR.2017.470"},{"key":"29_CR27","doi-asserted-by":"crossref","unstructured":"Hu, R., Singh, A.: Unit: multimodal multitask learning with a unified transformer. arxiv.org\/abs\/2102.10772 (2021)","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"29_CR28","unstructured":"Huang, G., Pang, B., Zhu, Z., Rivera, C., Soricut, R.: Multimodal pretraining for dense video captioning. In: AACL-IJCNLP (2020)"},{"key":"29_CR29","doi-asserted-by":"crossref","unstructured":"Huang, Z., Zeng, Z., Huang, Y., Liu, B., Fu, D., Fu, J.: Seeing out of the box: end-to-end pre-training for vision-language representation learning. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"29_CR30","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for compositional question answering over realworld images. In: CVPR (2019)"},{"key":"29_CR31","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: ICML (2021)"},{"key":"29_CR32","doi-asserted-by":"crossref","unstructured":"Jiang, H., Misra, I., Rohrbach, M., Learned-Miller, E., Chen, X.: In defense of grid features for visual question answering. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"29_CR33","doi-asserted-by":"crossref","unstructured":"Johnson, J., Hariharan, B., van der Maaten, L., Fei-Fei, L., Zitnick, C.L., Girshick, R.: A diagnostic dataset for compositional language and elementary visual reasoning. 
In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.215"},{"key":"29_CR34","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"29_CR35","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Misra, I., Synnaeve, G., Carion, N.: MDETR - modulated detection for end-to-end multi-modal understanding (2021). https:\/\/arxiv.org\/abs\/2104.12763","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"29_CR36","doi-asserted-by":"crossref","unstructured":"Kant, Y., Moudgil, A., Batra, D., Parikh, D., Agrawal, H.: Contrast and classify: training robust VQA models. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00163"},{"key":"29_CR37","unstructured":"Kim, W., Son, B., Kim, I.: ViLT: vision-and-language transformer without convolution or region supervision. In: ICML (2021)"},{"key":"29_CR38","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations (2016). https:\/\/arxiv.org\/abs\/1602.07332"},{"key":"29_CR39","doi-asserted-by":"crossref","unstructured":"Kudo, T., Richardson, J.: SentencePiece: a simple and language independent subword tokenizer and detokenizer for neural text processing. arXiv preprint arXiv:1808.06226 (2018)","DOI":"10.18653\/v1\/D18-2012"},{"key":"29_CR40","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. Arxiv:https:\/\/arxiv.org\/abs\/1908.03557 (2019)"},{"key":"29_CR41","doi-asserted-by":"crossref","unstructured":"Li, L.H., et al.: Grounded language-image pre-training. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"29_CR42","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"29_CR43","doi-asserted-by":"crossref","unstructured":"Liao, Y., et al.: A real-time cross-modality correlation filtering method for referring expression comprehension. In: CVPR, pp. 10880\u201310889 (2020)","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"29_CR44","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"29_CR45","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"29_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"TY Lin","year":"2014","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. 
In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"29_CR47","doi-asserted-by":"crossref","unstructured":"Lin, X., Bertasius, G., Wang, J., Chang, S.F., Parikh, D.: Vx2text: end-to-end learning of video-based text generation from multimodal inputs. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00693"},{"key":"29_CR48","doi-asserted-by":"crossref","unstructured":"Liu, C., Lin, Z., Shen, X., Yang, J., Lu, X., Yuille, A.: Recurrent multimodal interaction for referring image segmentation. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.143"},{"key":"29_CR49","doi-asserted-by":"crossref","unstructured":"Liu, D., Zhang, H., Wu, F., Zha, Z.J.: Learning to assemble neural module tree networks for visual grounding. In: ICCV, pp. 4673\u20134682 (2019)","DOI":"10.1109\/ICCV.2019.00477"},{"key":"29_CR50","doi-asserted-by":"crossref","unstructured":"Liu, X., Wang, Z., Shao, J., Wang, X., Li, H.: Improving referring expression grounding with cross-modal attention-guided erasing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1950\u20131959 (2019)","DOI":"10.1109\/CVPR.2019.00205"},{"key":"29_CR51","doi-asserted-by":"crossref","unstructured":"Liu, Z., Stent, S., Li, J., Gideon, J., Han, S.: LocTex: learning data-efficient visual representations from localized textual supervision. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00217"},{"key":"29_CR52","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: VilBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: CVPR (2019)"},{"key":"29_CR53","doi-asserted-by":"crossref","unstructured":"Lu, J., Goswami, V., Rohrbach, M., Parikh, D., Lee, S.: 12-in-1: multi-task vision and language representation learning. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01045"},{"key":"29_CR54","doi-asserted-by":"crossref","unstructured":"Luo, G., et al.: Multi-task collaborative network for joint referring expression comprehension and segmentation. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10031\u201310040 (2020)","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"29_CR55","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"29_CR56","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"656","DOI":"10.1007\/978-3-030-01252-6_39","volume-title":"Computer Vision \u2013 ECCV 2018","author":"E Margffoy-Tuay","year":"2018","unstructured":"Margffoy-Tuay, E., P\u00e9rez, J.C., Botero, E., Arbel\u00e1ez, P.: Dynamic multimodal instance segmentation guided by natural language queries. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11215, pp. 656\u2013672. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01252-6_39"},{"key":"29_CR57","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.L.: Im2text: describing images using 1 million captioned photographs. In: NeurIPS (2011)"},{"key":"29_CR58","unstructured":"Papageorgiou, C., Oren, M., Poggio, T.: A general framework for object detection. 
In: ICCV (1998)"},{"key":"29_CR59","doi-asserted-by":"crossref","unstructured":"Peng, J., Bu, X., Sun, M., Zhang, Z., Tan, T., Yan, J.: Large-scale object detection in the wild from imbalanced multi-labels. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00973"},{"key":"29_CR60","unstructured":"Plummer, B.A., Shih, K.J., Li, Y., Xu, K., Lazebnik, S., Sclaroff, S., Saenko, K.: Revisiting image-language networks for open-ended phrase detection. TPAMI (2020)"},{"key":"29_CR61","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"29_CR62","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. Int. J. Comput. Vis. (2017)","DOI":"10.1007\/s11263-016-0965-7"},{"key":"29_CR63","first-page":"4426","volume":"23","author":"Y Qiao","year":"2020","unstructured":"Qiao, Y., Deng, C., Wu, Q.: Referring expression comprehension: a survey of methods and datasets. IEEE TMM 23, 4426\u20134440 (2020)","journal-title":"IEEE TMM"},{"key":"29_CR64","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"29_CR65","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. (JMLR) 21, 1\u201367 (2020)","journal-title":"J. Mach. Learn. Res. (JMLR)"},{"key":"29_CR66","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., Farhadi, A.: You only look once: unified, real-time object detection. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.91"},{"key":"29_CR67","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems (2015)"},{"key":"29_CR68","doi-asserted-by":"crossref","unstructured":"Hu, R., Xu, H., Rohrbach, M., Feng, J., Saenko, K., Darrell, T.: Natural language object retrieval. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.493"},{"key":"29_CR69","unstructured":"Rowley, H., Baluja, S., Kanade, T.: Human face detection in visual scenes. In: Advances in Neural Information Processing Systems (1995)"},{"key":"29_CR70","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: ReferitGame: referring to objects in photographs of natural scenes. In: EMNLP (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"29_CR71","doi-asserted-by":"crossref","unstructured":"Shao, S., et al.: Objects365: a large-scale, high-quality dataset for object detection. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00852"},{"key":"29_CR72","doi-asserted-by":"crossref","unstructured":"Srinivasan, K., Raman, K., Chen, J., Bendersky, M., Najork, M.: WIT: Wikipedia-based image text dataset for multimodal multilingual machine learning. arXiv:2103.01913 (2021)","DOI":"10.1145\/3404835.3463257"},{"key":"29_CR73","doi-asserted-by":"crossref","unstructured":"Suhr, A., Lewis, M., Yeh, J., Artzi, Y.: A corpus of natural language for visual reasoning. 
In: Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (2017)","DOI":"10.18653\/v1\/P17-2034"},{"key":"29_CR74","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. In: EMNLP (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"29_CR75","doi-asserted-by":"crossref","unstructured":"Vaillant, R., Monrocq, C., Cun, Y.L.: An original approach for the localization of objects in images. In: IEEE Proceedings of the Visual Image Signal Processing (1994)","DOI":"10.1049\/ip-vis:19941301"},{"key":"29_CR76","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"792","DOI":"10.1007\/978-3-319-46493-0_48","volume-title":"Computer Vision \u2013 ECCV 2016","author":"VK Nagaraja","year":"2016","unstructured":"Nagaraja, V.K., Morariu, V.I., Davis, L.S.: Modeling context between objects for referring expression understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 792\u2013807. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_48"},{"key":"29_CR77","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"29_CR78","doi-asserted-by":"publisher","first-page":"137","DOI":"10.1023\/B:VISI.0000013087.49260.fb","volume":"57","author":"P Viola","year":"2001","unstructured":"Viola, P., Jones, M.: Robust real-time object detection. Int. J. Comput. Vis. 57, 137\u2013154 (2001). https:\/\/doi.org\/10.1023\/B:VISI.0000013087.49260.fb","journal-title":"Int. J. Comput. Vis."},{"key":"29_CR79","doi-asserted-by":"publisher","first-page":"394","DOI":"10.1109\/TPAMI.2018.2797921","volume":"41","author":"L Wang","year":"2018","unstructured":"Wang, L., Li, Y., Huang, J., Lazebnik, S.: Learning two-branch neural networks for image-text matching tasks. IEEE Trans. Pattern Anal. Mach. Intell. 41, 394\u2013407 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"29_CR80","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"696","DOI":"10.1007\/978-3-319-46484-8_42","volume-title":"Computer Vision \u2013 ECCV 2016","author":"M Wang","year":"2016","unstructured":"Wang, M., Azab, M., Kojima, N., Mihalcea, R., Deng, J.: Structured matching for phrase localization. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 696\u2013711. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_42"},{"key":"29_CR81","doi-asserted-by":"crossref","unstructured":"Wang, P., Wu, Q., Cao, J., Shen, C., Gao, L., Hengel, A.V.D.: Neighbourhood watch: referring expression comprehension via language-guided graph attention networks. In: CVPR, pp. 1960\u20131968 (2019)","DOI":"10.1109\/CVPR.2019.00206"},{"key":"29_CR82","doi-asserted-by":"publisher","first-page":"2251","DOI":"10.1109\/TPAMI.2018.2857768","volume":"41","author":"Y Xian","year":"2018","unstructured":"Xian, Y., Lampert, C.H., Schiele, B., Akata, Z.: Zero-shot learning - a comprehensive evaluation of the good, the bad and the ugly. TPAMI 41, 2251\u20132265 (2018)","journal-title":"TPAMI"},{"key":"29_CR83","unstructured":"Xie, N., Lai, F., Doran, D., Kadav, A.: Visual entailment: a novel task for fine-grained image understanding (2019). https:\/\/arxiv.org\/abs\/1901.06706"},{"key":"29_CR84","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: E2E-VLP: end-to-end vision-language pre-training enhanced by visual learning. 
In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (2021)","DOI":"10.18653\/v1\/2021.acl-long.42"},{"key":"29_CR85","doi-asserted-by":"crossref","unstructured":"Yang, S., Li, G., Yu, Y.: Dynamic graph attention for referring expression comprehension. In: ICCV, pp. 4644\u20134653 (2019)","DOI":"10.1109\/ICCV.2019.00474"},{"key":"29_CR86","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"387","DOI":"10.1007\/978-3-030-58568-6_23","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Yang","year":"2020","unstructured":"Yang, Z., Chen, T., Wang, L., Luo, J.: Improving one-stage visual grounding by recursive sub-query construction. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12359, pp. 387\u2013404. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58568-6_23"},{"key":"29_CR87","doi-asserted-by":"crossref","unstructured":"Yang, Z., Gong, B., Wang, L., Huang, W., Yu, D., Luo, J.: A fast and accurate one-stage approach to visual grounding. In: ICCV, pp. 4683\u20134693 (2019)","DOI":"10.1109\/ICCV.2019.00478"},{"key":"29_CR88","doi-asserted-by":"crossref","unstructured":"Yu, F., et al.: ERNIE-VIL: knowledge enhanced vision-language representations through scene graph. In: AAAI (2021)","DOI":"10.1609\/aaai.v35i4.16431"},{"key":"29_CR89","doi-asserted-by":"crossref","unstructured":"Yu, L., et al.: MAttNet: modular attention network for referring expression comprehension. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00142"},{"key":"29_CR90","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 69\u201385. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5"},{"key":"29_CR91","doi-asserted-by":"crossref","unstructured":"Zellers, R., Bisk, Y., Farhadi, A., Choi, Y.: From recognition to cognition: visual commonsense reasoning. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00688"},{"key":"29_CR92","doi-asserted-by":"crossref","unstructured":"Zhang, H., Niu, Y., Chang, S.F.: Grounding referring expressions in images by variational context. In: CVPR, pp. 4158\u20134166 (2018)","DOI":"10.1109\/CVPR.2018.00437"},{"key":"29_CR93","doi-asserted-by":"crossref","unstructured":"Zhang, P., et al.: VinVL: revisiting visual representations in vision-language models. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"29_CR94","doi-asserted-by":"crossref","unstructured":"Zhao, F., Li, J., Zhao, J., Feng, J.: Weakly supervised phrase localization with multi-scale anchored transformer network. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00597"},{"key":"29_CR95","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J.J., Gao, J.: Unified vision-language pre-training for image captioning and VQA. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"29_CR96","doi-asserted-by":"publisher","first-page":"998","DOI":"10.1109\/TCSVT.2019.2899569","volume":"30","author":"P Zhu","year":"2018","unstructured":"Zhu, P., Wang, H., Saligrama, V.: Zero-shot detection. IEEE Trans. Circ. Syst. 
Video Technol. 30, 998\u20131010 (2018)","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"29_CR97","doi-asserted-by":"crossref","unstructured":"Zhuang, B., Wu, Q., Shen, C., Reid, I., van den Hengel, A.: Parallel attention: a unified framework for visual object discovery through dialogs and queries. In: CVPR. pp. 4252\u20134261 (2018)","DOI":"10.1109\/CVPR.2018.00447"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-20059-5_29","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,28]],"date-time":"2022-10-28T16:11:33Z","timestamp":1666973493000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-20059-5_29"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031200588","9783031200595"],"references-count":97,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-20059-5_29","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"29 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for 
Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}