{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T18:36:18Z","timestamp":1775154978596,"version":"3.50.1"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319464473","type":"print"},{"value":"9783319464480","type":"electronic"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-46448-0_7","type":"book-chapter","created":{"date-parts":[[2016,9,16]],"date-time":"2016-09-16T08:27:24Z","timestamp":1474014444000},"page":"108-124","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":266,"title":["Segmentation from Natural Language Expressions"],"prefix":"10.1007","author":[{"given":"Ronghang","family":"Hu","sequence":"first","affiliation":[]},{"given":"Marcus","family":"Rohrbach","sequence":"additional","affiliation":[]},{"given":"Trevor","family":"Darrell","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,9,17]]},"reference":[{"key":"7_CR1","unstructured":"Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Man\u00e9, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi\u00e9gas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: large-scale machine learning on heterogeneous systems. arXiv:1603.04467 (2016)"},{"key":"7_CR2","doi-asserted-by":"crossref","unstructured":"Andreas, J., Rohrbach, M., Darrell, T., Klein, D.: Neural module networks. In: Proceedings of the IEEE International Conference on Computer Vision (2016)","DOI":"10.1109\/CVPR.2016.12"},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Arbel\u00e1ez, P., Pont-Tuset, J., Barron, J., Marques, F., Malik, J.: Multiscale combinatorial grouping. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 328\u2013335 (2014)","DOI":"10.1109\/CVPR.2014.49"},{"key":"7_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"430","DOI":"10.1007\/978-3-642-33786-4_32","volume-title":"Computer Vision \u2013 ECCV 2012","author":"J Carreira","year":"2012","unstructured":"Carreira, J., Caseiro, R., Batista, J., Sminchisescu, C.: Semantic segmentation with second-order pooling. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7578, pp. 430\u2013443. Springer, Heidelberg (2012). doi: 10.1007\/978-3-642-33786-4_32"},{"issue":"7","key":"7_CR5","doi-asserted-by":"publisher","first-page":"1312","DOI":"10.1109\/TPAMI.2011.231","volume":"34","author":"J Carreira","year":"2012","unstructured":"Carreira, J., Sminchisescu, C.: CPMC: automatic object segmentation using constrained parametric min-cuts. IEEE Trans. Pattern Anal. Mach. Intell. 34(7), 1312\u20131328 (2012)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"7_CR6","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Semantic image segmentation with deep convolutional nets and fully connected CRFs. In: Proceedings of the International Conference on Learning Representations (2015)"},{"key":"7_CR7","doi-asserted-by":"crossref","unstructured":"Cho, K., van Merri\u00ebnboer, B., Bahdanau, D., Bengio, Y.: On the properties of neural machine translation: encoder-decoder approaches. In: Syntax, Semantics and Structure in Statistical Translation (2014)","DOI":"10.3115\/v1\/W14-4012"},{"key":"7_CR8","doi-asserted-by":"crossref","unstructured":"Donahue, J., Anne Hendricks, L., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., Darrell, T.: Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2625\u20132634 (2015)","DOI":"10.1109\/CVPR.2015.7298878"},{"issue":"4","key":"7_CR9","doi-asserted-by":"publisher","first-page":"419","DOI":"10.1016\/j.cviu.2009.03.008","volume":"114","author":"HJ Escalante","year":"2010","unstructured":"Escalante, H.J., Hern\u00e1ndez, C.A., Gonzalez, J.A., L\u00f3pez-L\u00f3pez, A., Montes, M., Morales, E.F., Sucar, L.E., Villase\u00f1or, L., Grubinger, M.: The segmented and annotated IAPR TC-12 benchmark. Comput. Vis. Image Underst. 114(4), 419\u2013428 (2010)","journal-title":"Comput. Vis. Image Underst."},{"key":"7_CR10","unstructured":"Everingham, M., Van Gool, L., Williams, C.K.I., Winn, J., Zisserman, A.: The PASCAL Visual Object Classes Challenge 2012 (VOC 2012) Results (2012). http:\/\/www.pascal-network.org\/challenges\/VOC\/voc2012\/workshop\/index.html"},{"key":"7_CR11","unstructured":"Grubinger, M., Clough, P., M\u00fcller, H., Deselaers, T.: The IAPR TC-12 benchmark: a new evaluation resource for visual information systems. In: International Workshop OntoImage, pp. 13\u201323 (2006)"},{"key":"7_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"297","DOI":"10.1007\/978-3-319-10584-0_20","volume-title":"Computer Vision \u2013 ECCV 2014","author":"B Hariharan","year":"2014","unstructured":"Hariharan, B., Arbel\u00e1ez, P., Girshick, R., Malik, J.: Simultaneous detection and segmentation. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8695, pp. 297\u2013312. Springer, Heidelberg (2014). doi: 10.1007\/978-3-319-10584-0_20"},{"issue":"8","key":"7_CR13","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"7_CR14","doi-asserted-by":"crossref","unstructured":"Hu, R., Rohrbach, M., Venugopalan, S., Darrell, T.: Utilizing large scale vision and text datasets for image segmentation from referring expressions. arXiv preprint (2016)","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"7_CR15","doi-asserted-by":"crossref","unstructured":"Hu, R., Xu, H., Rohrbach, M., Feng, J., Saenko, K., Darrell, T.: Natural language object retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.493"},{"key":"7_CR16","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.L.: ReferitGame: referring to objects in photographs of natural scenes. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 787\u2013798 (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"7_CR17","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"7_CR18","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"7_CR19","unstructured":"Mao, J., Xu, W., Yang, Y., Wang, J., Huang, Z., Yuille, A.: Deep captioning with multimodal recurrent neural networks (m-RNN). In: Proceedings of the International Conference on Learning Representations (2015)"},{"key":"7_CR20","doi-asserted-by":"crossref","unstructured":"Meyers, A., Johnston, N., Rathod, V., Korattikara, A., Gorban, A., Silberman, N., Guadarrama, S., Papandreou, G., Huang, J., Murphy, K.P.: Im2Calories: towards an automated mobile vision food diary. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1233\u20131241 (2015)","DOI":"10.1109\/ICCV.2015.146"},{"key":"7_CR21","doi-asserted-by":"crossref","unstructured":"Noh, H., Hong, S., Han, B.: Learning deconvolution network for semantic segmentation. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1520\u20131528 (2015)","DOI":"10.1109\/ICCV.2015.178"},{"key":"7_CR22","doi-asserted-by":"crossref","unstructured":"Plummer, B., Wang, L., Cervantes, C., Caicedo, J., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"7_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"817","DOI":"10.1007\/978-3-319-46448-0_49","volume-title":"ECCV 2016","author":"A Rohrbach","year":"2016","unstructured":"Rohrbach, A., Rohrbach, M., Hu, R., Darrell, T., Schiele, B.: Grounding of textual phrases in images by reconstruction. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 817\u2013834. Springer, Heidelberg (2016)"},{"key":"7_CR24","doi-asserted-by":"publisher","first-page":"309","DOI":"10.1145\/1015706.1015720","volume":"23","author":"C Rother","year":"2004","unstructured":"Rother, C., Kolmogorov, V., Blake, A.: GrabCut: interactive foreground extraction using iterated graph cuts. ACM Trans. Graph. (TOG) 23, 309\u2013314 (2004). ACM","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"3","key":"7_CR25","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., et al.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"7_CR26","doi-asserted-by":"crossref","unstructured":"Sadeghi, M.A., Farhadi, A.: Recognition using visual phrases. In: 2011 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1745\u20131752. IEEE (2011)","DOI":"10.1109\/CVPR.2011.5995711"},{"key":"7_CR27","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: Proceedings of the International Conference on Learning Representations (2015)"},{"key":"7_CR28","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: Advances in Neural Information Processing Systems, pp. 3104\u20133112 (2014)"},{"key":"7_CR29","doi-asserted-by":"crossref","unstructured":"Xu, H., Saenko, K.: Ask, attend and answer: exploring question-guided spatial attention for visual question answering. arXiv preprint arXiv:1511.05234 (2015)","DOI":"10.1007\/978-3-319-46478-7_28"},{"key":"7_CR30","unstructured":"Xu, K., Ba, J., Kiros, R., Courville, A., Salakhutdinov, R., Zemel, R., Bengio, Y.: Show, attend and tell: neural image caption generation with visual attention. In: Proceedings of the International Conference on Machine Learning (ICML) (2015)"},{"key":"7_CR31","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., Smola, A.: Stacked attention networks for image question answering. In: Proceedings of the IEEE International Conference on Computer Vision (2016)","DOI":"10.1109\/CVPR.2016.10"},{"key":"7_CR32","unstructured":"Yu, F., Koltun, V.: Multi-scale context aggregation by dilated convolutions. In: Proceedings of the International Conference on Learning Representations (2016)"},{"key":"7_CR33","doi-asserted-by":"crossref","unstructured":"Zheng, S., Jayasumana, S., Romera-Paredes, B., Vineet, V., Su, Z., Du, D., Huang, C., Torr, P.H.: Conditional random fields as recurrent neural networks. In: Proceedings of the IEEE International Conference on Computer Vision (2015)","DOI":"10.1109\/ICCV.2015.179"},{"key":"7_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"391","DOI":"10.1007\/978-3-319-10602-1_26","volume-title":"Computer Vision \u2013 ECCV 2014","author":"CL Zitnick","year":"2014","unstructured":"Zitnick, C.L., Doll\u00e1r, P.: Edge boxes: locating object proposals from edges. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 391\u2013405. Springer, Heidelberg (2014). doi: 10.1007\/978-3-319-10602-1_26"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2016"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-46448-0_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,8]],"date-time":"2022-07-08T16:30:55Z","timestamp":1657297855000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-46448-0_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319464473","9783319464480"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-46448-0_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"17 September 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2016","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 October 2016","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 October 2016","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2016","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.eccv2016.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}