{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T19:10:56Z","timestamp":1775243456358,"version":"3.50.1"},"publisher-location":"Cham","reference-count":64,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012458","type":"print"},{"value":"9783030012465","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01246-5_21","type":"book-chapter","created":{"date-parts":[[2018,10,5]],"date-time":"2018-10-05T16:14:56Z","timestamp":1538756096000},"page":"346-363","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":178,"title":["Factorizable Net: An Efficient Subgraph-Based Framework for Scene Graph Generation"],"prefix":"10.1007","author":[{"given":"Yikang","family":"Li","sequence":"first","affiliation":[]},{"given":"Wanli","family":"Ouyang","sequence":"additional","affiliation":[]},{"given":"Bolei","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Jianping","family":"Shi","sequence":"additional","affiliation":[]},{"given":"Chao","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Xiaogang","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"key":"21_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"401","DOI":"10.1007\/978-3-319-10593-2_27","volume-title":"Computer Vision \u2013 ECCV 2014","author":"S Antol","year":"2014","unstructured":"Antol, S., Zitnick, C.L., Parikh, D.: Zero-shot learning via visual abstraction. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014, Part IV. LNCS, vol. 8692, pp. 401\u2013416. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10593-2_27"},{"key":"21_CR2","doi-asserted-by":"crossref","unstructured":"Berg, A.C., et al.: Understanding and predicting importance in images. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6248100"},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Chang, A., Savva, M., Manning, C.: Semantic parsing for text to 3D scene generation. In: ACL (2014)","DOI":"10.3115\/v1\/W14-2404"},{"key":"21_CR4","doi-asserted-by":"crossref","unstructured":"Choi, M.J., Lim, J.J., Torralba, A., Willsky, A.S.: Exploiting hierarchical context on a large database of object categories. In: CVPR (2010)","DOI":"10.1109\/CVPR.2010.5540221"},{"key":"21_CR5","doi-asserted-by":"crossref","unstructured":"Choi, W., Chao, Y.W., Pantofaru, C., Savarese, S.: Understanding indoor scenes using 3D geometric phrases. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 33\u201340 (2013)","DOI":"10.1109\/CVPR.2013.12"},{"key":"21_CR6","doi-asserted-by":"crossref","unstructured":"Dai, B., Zhang, Y., Lin, D.: Detecting visual relationships with deep relational networks. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.352"},{"key":"21_CR7","doi-asserted-by":"crossref","unstructured":"Das, P., Xu, C., Doell, R.F., Corso, J.J.: A thousand frames in just a few words: lingual description of videos through latent topics and sparse object stitching. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.340"},{"key":"21_CR8","doi-asserted-by":"crossref","unstructured":"Divvala, S.K., Farhadi, A., Guestrin, C.: Learning everything about anything: webly-supervised visual concept learning. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.412"},{"key":"21_CR9","doi-asserted-by":"crossref","unstructured":"Elhoseiny, M., Cohen, S., Chang, W., Price, B.L., Elgammal, A.M.: Sherlock: scalable fact learning in images. In: AAAI (2017)","DOI":"10.1609\/aaai.v31i1.11214"},{"key":"21_CR10","doi-asserted-by":"crossref","unstructured":"Elliott, D., Keller, F.: Image description using visual dependency representations. In: EMNLP (2013)","DOI":"10.18653\/v1\/D13-1128"},{"key":"21_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1007\/978-3-642-15561-1_2","volume-title":"Computer Vision \u2013 ECCV 2010","author":"A Farhadi","year":"2010","unstructured":"Farhadi, A., et al.: Every picture tells a story: generating sentences from images. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010, Part IV. LNCS, vol. 6314, pp. 15\u201329. Springer, Heidelberg (2010). https:\/\/doi.org\/10.1007\/978-3-642-15561-1_2"},{"key":"21_CR12","doi-asserted-by":"crossref","unstructured":"Fidler, S., Leonardis, A.: Towards scalable representations of object categories: learning a hierarchy of parts. In: CVPR (2007)","DOI":"10.1109\/CVPR.2007.383269"},{"key":"21_CR13","doi-asserted-by":"crossref","unstructured":"Galleguillos, C., Belongie, S.: Context based object categorization: a critical survey. In: CVIU (2010)","DOI":"10.1016\/j.cviu.2010.02.004"},{"key":"21_CR14","doi-asserted-by":"crossref","unstructured":"Galleguillos, C., Rabinovich, A., Belongie, S.: Object categorization using co-occurrence, location and appearance. In: CVPR (2008)","DOI":"10.1109\/CVPR.2008.4587799"},{"key":"21_CR15","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast R-CNN. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"21_CR16","doi-asserted-by":"crossref","unstructured":"Gkioxari, G., Girshick, R., Malik, J.: Contextual action recognition with R* CNN. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.129"},{"key":"21_CR17","doi-asserted-by":"publisher","first-page":"210","DOI":"10.1007\/s11263-013-0658-4","volume":"106","author":"Y Gong","year":"2014","unstructured":"Gong, Y., Ke, Q., Isard, M., Lazebnik, S.: A multi-view embedding space for modeling internet images, tags, and their semantics. IJCV 106, 210\u2013233 (2014)","journal-title":"IJCV"},{"key":"21_CR18","doi-asserted-by":"publisher","first-page":"300","DOI":"10.1007\/s11263-008-0140-x","volume":"80","author":"S Gould","year":"2008","unstructured":"Gould, S., Rodgers, J., Cohen, D., Elidan, G., Koller, D.: Multi-class segmentation with relative location prior. IJCV 80, 300\u2013316 (2008)","journal-title":"IJCV"},{"key":"21_CR19","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., et al.: Youtube2text: recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.337"},{"key":"21_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"16","DOI":"10.1007\/978-3-540-88682-2_3","volume-title":"Computer Vision \u2013 ECCV 2008","author":"A Gupta","year":"2008","unstructured":"Gupta, A., Davis, L.S.: Beyond nouns: exploiting prepositions and comparative adjectives for learning visual classifiers. In: Forsyth, D., Torr, P., Zisserman, A. (eds.) ECCV 2008, Part I. LNCS, vol. 5302, pp. 16\u201329. Springer, Heidelberg (2008). https:\/\/doi.org\/10.1007\/978-3-540-88682-2_3"},{"key":"21_CR21","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"21_CR22","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition (2015). arXiv preprint: arXiv:1512.03385","DOI":"10.1109\/CVPR.2016.90"},{"key":"21_CR23","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/s11263-008-0137-5","volume":"80","author":"D Hoiem","year":"2008","unstructured":"Hoiem, D., Efros, A.A., Hebert, M.: Putting objects in perspective. IJCV 80, 3\u201315 (2008)","journal-title":"IJCV"},{"key":"21_CR24","doi-asserted-by":"crossref","unstructured":"Izadinia, H., Sadeghi, F., Farhadi, A.: Incorporating scene context and object layout into appearance modeling. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.37"},{"key":"21_CR25","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: Densecap: fully convolutional localization networks for dense captioning (2015). arXiv preprint: arXiv:1511.07571","DOI":"10.1109\/CVPR.2016.494"},{"key":"21_CR26","doi-asserted-by":"crossref","unstructured":"Johnson, J., et al.: Image retrieval using scene graphs. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"21_CR27","unstructured":"Karpathy, A., Joulin, A., Fei-Fei, L.F.: Deep fragment embeddings for bidirectional image sentence mapping. In: NIPS (2014)"},{"key":"21_CR28","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. IJCV 123, 32\u201373 (2017)","journal-title":"IJCV"},{"key":"21_CR29","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: NIPS, pp. 1097\u20131105 (2012)"},{"key":"21_CR30","doi-asserted-by":"crossref","unstructured":"Kulkarni, G., et al.: Baby talk: understanding and generating image descriptions. In: CVPR (2011)","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"21_CR31","doi-asserted-by":"crossref","unstructured":"Kumar, M.P., Koller, D.: Efficiently selecting regions for scene understanding. In: CVPR (2010)","DOI":"10.1109\/CVPR.2010.5540072"},{"key":"21_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"239","DOI":"10.1007\/978-3-642-15555-0_18","volume-title":"Computer Vision \u2013 ECCV 2010","author":"L Ladicky","year":"2010","unstructured":"Ladicky, L., Russell, C., Kohli, P., Torr, P.H.S.: Graph cut based inference with co-occurrence statistics. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010, Part V. LNCS, vol. 6315, pp. 239\u2013253. Springer, Heidelberg (2010). https:\/\/doi.org\/10.1007\/978-3-642-15555-0_18"},{"key":"21_CR33","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Visual question generation as dual task of visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6116\u20136124 (2018)","DOI":"10.1109\/CVPR.2018.00640"},{"key":"21_CR34","doi-asserted-by":"crossref","unstructured":"Li, Y., Ouyang, W., Wang, X., Tang, X.: ViP-CNN: visual phrase guided convolutional neural network. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.766"},{"key":"21_CR35","doi-asserted-by":"crossref","unstructured":"Li, Y., Ouyang, W., Zhou, B., Wang, K., Wang, X.: Scene graph generation from objects, phrases and region captions. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.142"},{"key":"21_CR36","unstructured":"Liao, W., Shuai, L., Rosenhahn, B., Yang, M.Y.: Natural language guided visual relationship detection (2017). arXiv preprint: arXiv:1711.06032"},{"key":"21_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"852","DOI":"10.1007\/978-3-319-46448-0_51","volume-title":"Computer Vision \u2013 ECCV 2016","author":"C Lu","year":"2016","unstructured":"Lu, C., Krishna, R., Bernstein, M., Fei-Fei, L.: Visual relationship detection with language priors. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016, Part I. LNCS, vol. 9905, pp. 852\u2013869. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_51"},{"key":"21_CR38","doi-asserted-by":"crossref","unstructured":"Lu, P., Li, H., Wei, Z., Wang, J., Wang, X.: Co-attending free-form regions and detections with multi-modal multiplicative feature embedding for visual question answering. In: AAAI (2018)","DOI":"10.1609\/aaai.v32i1.12240"},{"key":"21_CR39","doi-asserted-by":"crossref","unstructured":"Mensink, T., Gavves, E., Snoek, C.G.: Costa: co-occurrence statistics for zero-shot classification. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.313"},{"key":"21_CR40","unstructured":"Nair, V., Hinton, G.E.: Rectified linear units improve restricted Boltzmann machines. In: ICML (2010)"},{"key":"21_CR41","doi-asserted-by":"crossref","unstructured":"Peyre, J., Laptev, I., Schmid, C., Sivic, J.: Weakly-supervised learning of visual relations. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.554"},{"key":"21_CR42","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Mallya, A., Cervantes, C.M., Hockenmaier, J., Lazebnik, S.: Phrase localization and visual relationship detection with comprehensive linguistic cues. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.213"},{"key":"21_CR43","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"21_CR44","doi-asserted-by":"crossref","unstructured":"Rabinovich, A., Vedaldi, A., Galleguillos, C., Wiewiora, E., Belongie, S.: Objects in context. In: ICCV (2007)","DOI":"10.1109\/ICCV.2007.4408986"},{"key":"21_CR45","doi-asserted-by":"crossref","unstructured":"Ramanathan, V., et al.: Learning semantic relationships for better action retrieval in images. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298713"},{"key":"21_CR46","doi-asserted-by":"crossref","unstructured":"Regneri, M., Rohrbach, M., Wetzel, D., Thater, S., Schiele, B., Pinkal, M.: Grounding action descriptions in videos. In: ACL (2013)","DOI":"10.1162\/tacl_a_00207"},{"key":"21_CR47","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: NIPS (2015)"},{"key":"21_CR48","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Hu, R., Darrell, T., Schiele, B.: Grounding of textual phrases in images by reconstruction (2015). arXiv preprint: arXiv:1511.03745","DOI":"10.1007\/978-3-319-46448-0_49"},{"key":"21_CR49","doi-asserted-by":"crossref","unstructured":"Rohrbach, M., Qiu, W., Titov, I., Thater, S., Pinkal, M., Schiele, B.: Translating video content to natural language descriptions. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.61"},{"key":"21_CR50","unstructured":"Russell, B.C., Freeman, W.T., Efros, A.A., Sivic, J., Zisserman, A.: Using multiple segmentations to discover objects and their extent in image collections. In: CVPR (2006)"},{"key":"21_CR51","doi-asserted-by":"crossref","unstructured":"Sadeghi, M.A., Farhadi, A.: Recognition using visual phrases. In: CVPR (2011)","DOI":"10.1109\/CVPR.2011.5995711"},{"key":"21_CR52","doi-asserted-by":"crossref","unstructured":"Salakhutdinov, R., Torralba, A., Tenenbaum, J.: Learning to share visual appearance for multiclass object detection. In: CVPR (2011)","DOI":"10.1109\/CVPR.2011.5995720"},{"key":"21_CR53","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition (2014). arXiv preprint: arXiv:1409.1556"},{"key":"21_CR54","doi-asserted-by":"crossref","unstructured":"Sivic, J., Russell, B.C., Efros, A.A., Zisserman, A., Freeman, W.T.: Discovering objects and their location in images. In: ICCV (2005)","DOI":"10.1109\/ICCV.2005.77"},{"key":"21_CR55","unstructured":"Thomason, J., Venugopalan, S., Guadarrama, S., Saenko, K., Mooney, R.: Integrating language and vision to generate natural language descriptions of videos in the wild. In: COLING (2014)"},{"key":"21_CR56","unstructured":"Xiong, Y., Zhu, K., Lin, D., Tang, X.: Recognize complex events from static images by fusing deep channels. In: CVPR (2015)"},{"key":"21_CR57","doi-asserted-by":"crossref","unstructured":"Xu, D., Zhu, Y., Choy, C.B., Fei-Fei, L.: Scene graph generation by iterative message passing. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.330"},{"key":"21_CR58","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention (2015). arXiv preprint: arXiv:1502.03044"},{"key":"21_CR59","doi-asserted-by":"crossref","unstructured":"Yao, B., Fei-Fei, L.: Grouplet: a structured image representation for recognizing human and object interactions. In: CVPR (2010)","DOI":"10.1109\/CVPR.2010.5540234"},{"key":"21_CR60","unstructured":"Yao, J., Fidler, S., Urtasun, R.: Describing the scene as a whole: joint object detection, scene classification and semantic segmentation. In: CVPR (2012)"},{"key":"21_CR61","doi-asserted-by":"crossref","unstructured":"Yu, R., Li, A., Morariu, V.I., Davis, L.S.: Visual relationship detection with internal and external linguistic knowledge distillation. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.121"},{"key":"21_CR62","doi-asserted-by":"crossref","unstructured":"Zhang, H., Kyaw, Z., Chang, S.F., Chua, T.S.: Visual translation embedding network for visual relation detection. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.331"},{"key":"21_CR63","doi-asserted-by":"crossref","unstructured":"Zhuang, B., Liu, L., Shen, C., Reid, I.: Towards context-aware interaction recognition. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.71"},{"key":"21_CR64","doi-asserted-by":"crossref","unstructured":"Zitnick, C.L., Parikh, D., Vanderwende, L.: Learning the visual interpretation of sentences. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.211"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01246-5_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T18:36:44Z","timestamp":1775241404000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01246-5_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012458","9783030012465"],"references-count":64,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01246-5_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}