{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:24:23Z","timestamp":1774599863418,"version":"3.50.1"},"publisher-location":"Cham","reference-count":38,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012571","type":"print"},{"value":"9783030012588","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01258-8_16","type":"book-chapter","created":{"date-parts":[[2018,10,5]],"date-time":"2018-10-05T20:35:31Z","timestamp":1538771731000},"page":"258-274","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":74,"title":["Conditional Image-Text Embedding Networks"],"prefix":"10.1007","author":[{"given":"Bryan A.","family":"Plummer","sequence":"first","affiliation":[]},{"given":"Paige","family":"Kordas","sequence":"additional","affiliation":[]},{"given":"M. Hadi","family":"Kiapour","sequence":"additional","affiliation":[]},{"given":"Shuai","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Robinson","family":"Piramuthu","sequence":"additional","affiliation":[]},{"given":"Svetlana","family":"Lazebnik","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"key":"16_CR1","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"16_CR2","doi-asserted-by":"crossref","unstructured":"Babenko, B., Branson, S., Belongie, S.: Similarity metrics for categorization: from monolithic to category specific. In: ICCV (2009)","DOI":"10.1109\/ICCV.2009.5459264"},{"key":"16_CR3","doi-asserted-by":"crossref","unstructured":"Chen, K., Kovvuri, R., Gao, J., Nevatia, R.: MSRC: multimodal spatial regression with semantic context for phrase grounding. In: ICMR (2017)","DOI":"10.1145\/3078971.3078976"},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Chen, K., Kovvuri, R., Nevatia, R.: Query-guided regression network with context policy for phrase grounding. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.95"},{"key":"16_CR5","unstructured":"Everingham, M., Van Gool, L., Williams, C.K.I., Winn, J., Zisserman, A.: The PASCAL visual object classes challenge 2012 (VOC2012) results (2012). http:\/\/www.pascal-network.org\/challenges\/VOC\/voc2012\/workshop\/index.html"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Fang, H., et al.: From captions to visual concepts and back. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"16_CR7","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding. In: EMNLP (2016)","DOI":"10.18653\/v1\/D16-1044"},{"key":"16_CR8","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast R-CNN. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"16_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"241","DOI":"10.1007\/978-3-319-46466-4_15","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Gordo","year":"2016","unstructured":"Gordo, A., Almaz\u00e1n, J., Revaud, J., Larlus, D.: Deep image retrieval: learning global representations for image search. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 241\u2013257. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_15"},{"key":"16_CR10","doi-asserted-by":"crossref","unstructured":"Hu, R., Xu, H., Rohrbach, M., Feng, J., Saenko, K., Darrell, T.: Natural language object retrieval. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.493"},{"key":"16_CR11","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. In: ICML (2015)"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: DenseCap: fully convolutional localization networks for dense captioning. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.494"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Johnson, J., et al.: Image retrieval using scene graphs. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"16_CR14","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"16_CR15","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: ReferitGame: referring to objects in photographs of natural scenes. In: EMNLP (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"16_CR16","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: International Conference for Learning Representations (2015)"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"Klein, B., Lev, G., Sadeh, G., Wolf, L.: Associating neural word embeddings with deep image representations using fisher vector. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299073"},{"key":"16_CR18","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R.: Visual genome: connecting language and vision using crowdsourced dense image annotations. IJCV 123, 32\u201373 (2017)","journal-title":"IJCV"},{"key":"16_CR19","doi-asserted-by":"crossref","unstructured":"Liu, C., Mao, J., Sha, F., Yuille, A.: Attention correctness in neural image captioning. In: AAAI (2017)","DOI":"10.1609\/aaai.v31i1.11197"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"Liu, J., Wang, L., Yang, M.H.: Referring expression generation and comprehension via attributes. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.520"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"Luo, R., Shakhnarovich, G.: Comprehension-guided referring expressions. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.333"},{"key":"16_CR22","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"16_CR23","unstructured":"Mikolov, T., Chen, K., Corrado, G., Dean, J.: Efficient estimation of word representations in vector space. arXiv:1301.3781 (2013)"},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Mallya, A., Cervantes, C.M., Hockenmaier, J., Lazebnik, S.: Phrase localization and visual relationship detection with comprehensive image-language cues. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.213"},{"issue":"1","key":"16_CR25","doi-asserted-by":"publisher","first-page":"74","DOI":"10.1007\/s11263-016-0965-7","volume":"123","author":"BA Plummer","year":"2017","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. IJCV 123(1), 74\u201393 (2017)","journal-title":"IJCV"},{"key":"16_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-319-46448-0_1","volume-title":"Computer Vision \u2013 ECCV 2016","author":"F Radenovi\u0107","year":"2016","unstructured":"Radenovi\u0107, F., Tolias, G., Chum, O.: CNN image retrieval learns from BoW: unsupervised fine-tuning with hard examples. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 3\u201320. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_1"},{"key":"16_CR27","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: NIPS (2015)"},{"key":"16_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"817","DOI":"10.1007\/978-3-319-46448-0_49","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Rohrbach","year":"2016","unstructured":"Rohrbach, A., Rohrbach, M., Hu, R., Darrell, T., Schiele, B.: Grounding of textual phrases in images by reconstruction. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 817\u2013834. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_49"},{"key":"16_CR29","doi-asserted-by":"crossref","unstructured":"Tommasi, T., Mallya, A., Plummer, B.A., Lazebnik, S., Berg, A.C., Berg, T.L.: Solving visual madlibs with multiple cues. In: BMVC (2016)","DOI":"10.5244\/C.30.77"},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"Veit, A., Belongie, S., Karaletsos, T.: Conditional similarity networks. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.193"},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, Y., Lazebnik, S.: Learning deep structure-preserving image-text embeddings. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.541"},{"key":"16_CR32","unstructured":"Wang, L., Li, Y., Lazebnik, S.: Learning two-branch neural networks for image-text matching tasks. arXiv:1704.03470 (2017)"},{"key":"16_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"696","DOI":"10.1007\/978-3-319-46484-8_42","volume-title":"Computer Vision \u2013 ECCV 2016","author":"M Wang","year":"2016","unstructured":"Wang, M., Azab, M., Kojima, N., Mihalcea, R., Deng, J.: Structured matching for phrase localization. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 696\u2013711. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_42"},{"key":"16_CR34","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: ICML (2015)"},{"key":"16_CR35","unstructured":"Yeh, R.A., Xiong, J., Hwu, W.M., Do, M.N., Schwing, A.G.: Interpretable and globally optimal prediction for textual grounding using image concepts. In: NIPS (2017)"},{"key":"16_CR36","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 69\u201385. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5"},{"key":"16_CR37","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Yuan, L., Guo, Y., He, Z., Huang, I.A., Lee, H.: Discriminative bimodal networks for visual localization and detection with natural language queries. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.122"},{"key":"16_CR38","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"391","DOI":"10.1007\/978-3-319-10602-1_26","volume-title":"Computer Vision \u2013 ECCV 2014","author":"CL Zitnick","year":"2014","unstructured":"Zitnick, C.L., Doll\u00e1r, P.: Edge boxes: locating object proposals from edges. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 391\u2013405. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_26"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01258-8_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,5]],"date-time":"2022-10-05T00:38:52Z","timestamp":1664930332000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01258-8_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012571","9783030012588"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01258-8_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}