{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T07:22:29Z","timestamp":1744183349862,"version":"3.37.3"},"reference-count":69,"publisher":"Springer Science and Business Media LLC","issue":"2-4","license":[{"start":{"date-parts":[[2016,10,28]],"date-time":"2016-10-28T00:00:00Z","timestamp":1477612800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"ONR PECASE","award":["N00014-15-1-2291"],"award-info":[{"award-number":["N00014-15-1-2291"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2018,4]]},"DOI":"10.1007\/s11263-016-0958-6","type":"journal-article","created":{"date-parts":[[2016,10,28]],"date-time":"2016-10-28T01:06:49Z","timestamp":1477616809000},"page":"292-313","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Subjects and Their Objects: Localizing Interactees for a Person-Centric View of Importance"],"prefix":"10.1007","volume":"126","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3331-4551","authenticated-orcid":false,"given":"Chao-Yeh","family":"Chen","sequence":"first","affiliation":[]},{"given":"Kristen","family":"Grauman","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,10,28]]},"reference":[{"key":"958_CR1","doi-asserted-by":"crossref","unstructured":"Alexe, B., Deselaers, T., & Ferrari, V. (2010). What is an object? In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2010.5540226"},{"issue":"3","key":"958_CR2","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1145\/1276377.1276390","volume":"26","author":"S Avidan","year":"2007","unstructured":"Avidan, S., & Shamir, A. (2007). Seam carving for content-aware image resizing. ACM Transactions on Graphics, 26(3), 10.","journal-title":"ACM Transactions on Graphics"},{"key":"958_CR3","doi-asserted-by":"crossref","unstructured":"Berg, A., Berg. T., Daume, H., Dodge, J., Goyal, A., Han, X., Mensch, A., Mitchell, M., Sood, A., Stratos, K., & Yamaguchi, K. (2012). Understanding and predicting importance in images. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2012.6248100"},{"key":"958_CR4","unstructured":"Bishop, C. M. (1994). Mixture density networks. Tech. rep., Microsoft Research Cambridge."},{"key":"958_CR5","doi-asserted-by":"crossref","unstructured":"Carreira, J., & Sminchisescu, C. (2010). Constrained parametric min-cuts for automatic object segmentation. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2010.5540063"},{"key":"958_CR6","doi-asserted-by":"crossref","unstructured":"Chao, Y.W., Wang, Z., He, Y., Wang, J., & Deng, J. (2015). Hico: A benchmark for recognizing human-object interactions in images. In International conference on computer vision.","DOI":"10.1109\/ICCV.2015.122"},{"key":"958_CR7","unstructured":"Chen, C. Y., & Grauman, K. (2014). Predicting the location of nteractees?in novel human-object interactions. In Asian conference on computer vision."},{"key":"958_CR8","doi-asserted-by":"crossref","unstructured":"Cristani, M., Bazzani, L., Paggettim, G., Fossati, A., Bue, A. D., Menegaz, G., & Murino, V. (2011). Social interaction discovery by statistical analysis of f-formations. In British machine vision conference.","DOI":"10.5244\/C.25.23"},{"key":"958_CR9","doi-asserted-by":"crossref","unstructured":"Dalal, N., & Triggs, B. (2005). Histograms of oriented gradients for human detection. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2005.177"},{"key":"958_CR10","doi-asserted-by":"crossref","unstructured":"Damen, D., & Hogg, D. (2012). Detecting carried objects from sequences of walking pedestrians. In IEEE transactions on pattern analysis and machine intelligence.","DOI":"10.1109\/TPAMI.2011.205"},{"key":"958_CR11","doi-asserted-by":"crossref","unstructured":"Delaitre, V., Fouhey, D., Laptev, I., Sivic, J., Gupta, A., & Efros, A. (2012). Scene semantics from long-term observation of people. In European conference on computer vision.","DOI":"10.1007\/978-3-642-33783-3_21"},{"key":"958_CR12","doi-asserted-by":"crossref","unstructured":"Desai, C., & Ramanan, D. (2013). Predicting functional regions on objects. In CVPR workshop on scene analysis beyond semantics.","DOI":"10.1109\/CVPRW.2013.141"},{"key":"958_CR13","doi-asserted-by":"crossref","unstructured":"Desai, C., Ramanan, D., & Fowlkes, C. (2010). Discriminative models for static human-object interactions. In Workshopon structured models in computer vision, computervision and pattern recognition (SMiCV)","DOI":"10.1109\/CVPRW.2010.5543176"},{"key":"958_CR14","unstructured":"Devlin, J., Gupta, S., Girschick, R., Mitchell, M., & Zitnick, L. (2015). Exploring nearest neighbor approaches for image captioning. arXiv:1505.04467"},{"key":"958_CR15","doi-asserted-by":"crossref","unstructured":"Donahue, J., Hendricks, L.A., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., & Darrell, T. (2015). Long-term recurrent convolutional networks for visual recognition and description. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"958_CR16","doi-asserted-by":"crossref","unstructured":"Endres, I., & Hoiem, D. (2014). Category-independent object proposals with diverse ranking. In IEEE transactions on pattern analysis and machine intelligence.","DOI":"10.1109\/TPAMI.2013.122"},{"key":"958_CR17","doi-asserted-by":"crossref","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Gool, L. V., Williams, C. K. I., Winn, J., & Zisserman, A. (2010). The PASCAL visual object classes (VOC) challenge. IJCV, 88, 303\u2013338.","journal-title":"IJCV"},{"key":"958_CR18","doi-asserted-by":"crossref","unstructured":"Fang, H., Gupta, S., Iandola, F., Srivastava, R., Deng, L., Dollar, P., Gao, J., He, X., Mitchell, M., Platt, J., Zitnick, C., & Zweig, G. (2015). From captions to visual concepts and back. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"958_CR19","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Hejrati, M., Sadeghi, M.A., Young, P., Rashtchian, C., Hockenmaier, J., & Forsyth, D. (2010). Every picture tells a story: Generating sentences from images. In European conference on computer vision.","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"958_CR20","unstructured":"Farhadi, A., & Sadeghi, M. (2011). Recognition using visual phrases. In Conference on computer vision and pattern recognition."},{"key":"958_CR21","doi-asserted-by":"crossref","unstructured":"Fathi, A., Hodgins, J., & Rehg, J. (2012). Social interactions: A first-person perspective. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2012.6247805"},{"issue":"9","key":"958_CR22","doi-asserted-by":"crossref","first-page":"1627","DOI":"10.1109\/TPAMI.2009.167","volume":"32","author":"PF Felzenszwalb","year":"2010","unstructured":"Felzenszwalb, P. F., Girshick, R. B., McAllester, D., & Ramanan, D. (2010). Object detection with discriminatively trained part based models. PAMI, 32(9), 1627\u20131645.","journal-title":"PAMI"},{"key":"958_CR23","doi-asserted-by":"crossref","unstructured":"Fouhey, D. F., Delaitre, V., Gupta, A., Efros, A. A., Laptev, I., & Sivic, J. (2014). People watching: Human actions as a cue for single view geometry. In International journal of computer vision.","DOI":"10.1007\/s11263-014-0710-z"},{"key":"958_CR24","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., Krishnamoorthy, N., Malkarnenkar, G., Venugopalan, S., Mooney, R., Darrell, T., & Saenko, K. (2013). Youtube2text: Recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In International conference on computer vision.","DOI":"10.1109\/ICCV.2013.337"},{"key":"958_CR25","doi-asserted-by":"crossref","unstructured":"Gupta, A., Kembhavi, A., & Davis, L. (2009). Observing human-object interactions: using spatial and functional compatibility for recognition. PAMI, 31(10)","DOI":"10.1109\/TPAMI.2009.83"},{"key":"958_CR26","doi-asserted-by":"crossref","unstructured":"Gupta, A., Satkin, S., Efros, A., & Hebert, M. (2011). From 3D scene geometry to human workspace. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2011.5995448"},{"key":"958_CR27","unstructured":"Gupta, S., & Malik, J. (2015). Visual semantic role labeling. arXiv:1505.04474"},{"key":"958_CR28","doi-asserted-by":"crossref","unstructured":"Haritaoglu, I., Harwood, D., & Davis, L. (2000). W4: Real-time surveillance of people and their activities. In IEEE transactions on pattern analysis and machine intelligence.","DOI":"10.1109\/34.868683"},{"key":"958_CR29","doi-asserted-by":"crossref","unstructured":"Hou, X., & Zhang, L. (2007). Saliency detection: A spectral residual approach. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2007.383267"},{"key":"958_CR30","doi-asserted-by":"crossref","unstructured":"Hwang, S. J., & Grauman, K. (2010). Accounting for the relative importance of objects in image retrieval. In British machine vision conference.","DOI":"10.5244\/C.24.58"},{"key":"958_CR31","doi-asserted-by":"crossref","unstructured":"Ikizler-Cinbis, N., & Sclaroff, S. (2010). Object, scene and actions: Combining multiple features for human action recognition. In European conference on computer vision.","DOI":"10.1007\/978-3-642-15549-9_36"},{"key":"958_CR32","doi-asserted-by":"crossref","unstructured":"Itti, L., Koch, C., & Niebur, E. (1998). A model of saliency-based visual attention for rapid scene analysis. TPAMI, 20(11),","DOI":"10.1109\/34.730558"},{"key":"958_CR33","doi-asserted-by":"crossref","unstructured":"Jia, Y., Shelhamer, E., Donahue, J., Karayev, S., Long, J., Girshick, R., Guadarrama, S., & Darrell, T. (2014). Caffe: Convolutional architecture for fast feature embedding. In Proceedings of the acm international conference on multimedia.","DOI":"10.1145\/2647868.2654889"},{"key":"958_CR34","doi-asserted-by":"crossref","unstructured":"Karpathy, A., & Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descriptions. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"958_CR35","doi-asserted-by":"crossref","unstructured":"Kjellstrom, H., Romero, J., Mercado, D. M., & Kragic, D. (2008). Simultaneous visual recognition of manipulation actions and manipulated objects. In European conference on computer vision.","DOI":"10.1007\/978-3-540-88688-4_25"},{"key":"958_CR36","doi-asserted-by":"crossref","unstructured":"Koppula, H., & Saxena, A. (2013). Anticipating human activities using object affordances for reactive robotic response. In RSS.","DOI":"10.15607\/RSS.2013.IX.006"},{"key":"958_CR37","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G.E. (2012). Imagenet classification with deep convolutional neural networks. In Neural information processing systems conference."},{"key":"958_CR38","doi-asserted-by":"crossref","unstructured":"Kulkarni, G., Premraj, V., Dhar, S., Li, S., Choi, Y., Berg, A., & Berg, T. (2011). Baby talk: Understanding and generating image descriptions. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"958_CR39","unstructured":"Kuznetsova, P., Ordonez, V., Berg, A. C., Berg, T. L., & Choi, Y. (2012). Collective generation of natural image descriptions. In Association for computational linguistics."},{"key":"958_CR40","doi-asserted-by":"crossref","unstructured":"Le, D., Bernardi, R., & Uijlings, J. (2014). TUHOI: Trento universal human object interaction dataset. In Vision and language workshop at COLING.","DOI":"10.3115\/v1\/W14-5403"},{"key":"958_CR41","doi-asserted-by":"crossref","unstructured":"Lee, Y. J., Kim, J., & Grauman, K. (2011). Key-segments for video object segmentation. In International conference on computer vision.","DOI":"10.1109\/ICCV.2011.6126471"},{"key":"958_CR42","doi-asserted-by":"crossref","unstructured":"Lin, T., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft COCO: Common objects in context. In European conference on computer vision.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"958_CR43","doi-asserted-by":"crossref","unstructured":"Liu, T., Sun, J., Zheng, N., Tang, X., & Shum, H. (2007). Learning to detect a salient object. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2007.383047"},{"key":"958_CR44","doi-asserted-by":"crossref","unstructured":"Maji, S., Bourdev, L., & Malik, J. (2011). Action recognition from a distributed representation of pose and appearance. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2011.5995631"},{"key":"958_CR45","unstructured":"Marin-Jimenez, M., Zisserman, A., & Ferrari, V. (2011). Here\u2019s looking at you kid. detection people looking at each other in videos. In British machine vision conference."},{"key":"958_CR46","doi-asserted-by":"crossref","unstructured":"Ordonez, V., Deng, J., Choi, Y., Berg, A., & Berg, T. (2013). From large scale image categorization to entry-level categories. In International conference on computer vision.","DOI":"10.1109\/ICCV.2013.344"},{"key":"958_CR47","unstructured":"Ordonez, V., Kulkarni, G., & Berg, T.L. (2011). Im2text: Describing images using 1 million captioned photographs. In Neural information processing systems conference."},{"key":"958_CR48","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, & W. J. (2002). Bleu: A method for automatic evaluation of machine translation. In Association for computational linguistics."},{"key":"958_CR49","unstructured":"Park, H., & Shi, J. (2015). Social saliency. In Conference on computer vision and pattern recognition."},{"key":"958_CR50","doi-asserted-by":"crossref","unstructured":"Peursum, P., West, G., & Venkatesh, S. (2005). Combining image regions and human activity for indirect object recognition in indoor wide-angle views. In International conference on computer vision.","DOI":"10.1109\/ICCV.2005.57"},{"key":"958_CR51","doi-asserted-by":"crossref","unstructured":"Pirsiavash, H., Vondrick, C., & Torralba, A. (2014). Inferring the why in images. In Workshop on vision meets cognition at CVPR.","DOI":"10.21236\/ADA612444"},{"issue":"3","key":"958_CR52","doi-asserted-by":"crossref","first-page":"601","DOI":"10.1109\/TPAMI.2011.158","volume":"34","author":"A Prest","year":"2012","unstructured":"Prest, A., Schmid, C., & Ferrari, V. (2012). Weakly supervised learning of interactions between humans and objects. PAMI, 34(3), 601\u2013614.","journal-title":"PAMI"},{"key":"958_CR53","unstructured":"Recasens, A., Khosla, A., Vondrick, C., & Torralba, A. (2015). Where are they looking? In Neural information processing systems conference."},{"key":"958_CR54","doi-asserted-by":"crossref","unstructured":"Ronchi, M. R., & Perona, P. (2015). Describing common human visual actions in images. In British machine vision conference.","DOI":"10.5244\/C.29.52"},{"key":"958_CR55","doi-asserted-by":"crossref","unstructured":"Sadovnik, A., Chiu, Y. I., Snavely, N., Edelman, S., & Chen, T. (2012). Image description with a goal: Building efficient discriminating expressions for images. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2012.6248003"},{"key":"958_CR56","doi-asserted-by":"crossref","unstructured":"Spain, M., & Perona, P. (2008). Some objects are more equal than others: Measuring and predicting importance. In European conference on computer vision.","DOI":"10.1007\/978-3-540-88682-2_40"},{"issue":"1","key":"958_CR57","doi-asserted-by":"crossref","first-page":"59","DOI":"10.1007\/s11263-010-0376-0","volume":"91","author":"M Spain","year":"2011","unstructured":"Spain, M., & Perona, P. (2011). Measuring and predicting object importance. International Journal of Computer Vision, 91(1), 59\u201376.","journal-title":"International Journal of Computer Vision"},{"key":"958_CR58","doi-asserted-by":"crossref","unstructured":"Torralba, A. (2003). IJCV, 53(2), 169\u2013191.","DOI":"10.1023\/A:1023052124951"},{"key":"958_CR59","doi-asserted-by":"crossref","unstructured":"Xiao, J., Hays, J., Ehinger, K.A., Oliva, A., & Torralba, A. (2010). SUN database: Large-scale scene recognition from abbey to zoo. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"958_CR60","doi-asserted-by":"crossref","unstructured":"Yang, Y., Baker, S., Kannan, A., & Ramanan, D. (2012). Recognizing proxemics in personal photos. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2012.6248095"},{"key":"958_CR61","doi-asserted-by":"crossref","unstructured":"Yao, B., & Fei-Fei, L. (2010a). Grouplet: A structured image representation for recognizing human and object interactions. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2010.5540234"},{"key":"958_CR62","doi-asserted-by":"crossref","unstructured":"Yao, B., & Fei-Fei, L. (2010b). Modeling mutual context of object and human pose in human-object interaction activities. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2010.5540235"},{"key":"958_CR63","doi-asserted-by":"crossref","unstructured":"Yao, B., Jiang, X., Khosla, A., Lin, A. L., Guibas, L. J., & Fei-Fei, L. (2011). Action recognition by learning bases of action attributes and parts. In International conference on computer vision.","DOI":"10.1109\/ICCV.2011.6126386"},{"key":"958_CR64","doi-asserted-by":"crossref","unstructured":"Yao, B., Yang, X., Lin, L., Lee, M., & Zhu, S. C. (2010). I2T: Image parsing to text description. Proceedings of the IEEE, 98(8)","DOI":"10.1109\/JPROC.2010.2050411"},{"key":"958_CR65","doi-asserted-by":"crossref","unstructured":"Yatskar, M., Zettlemoyer, L., & Farhadi, A. (2016). Situation recognition: Visual semantic role labeling for image understanding. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2016.597"},{"key":"958_CR66","unstructured":"Yosinski, J., Clune, J., Bengio, Y., & Lipson, H. (2014). How transferable are features in deep neural networks? In Neural information processing systems conference."},{"key":"958_CR67","unstructured":"Zhou, B., Lapedriza, A., Xiao, J., Torralba, A., & Oliva, A. (2014). Learning deep features for scene recognition using places database. In Neural information processing systems conference."},{"key":"958_CR68","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Fathi, A., & Fei-Feim, L. (2014). Reasoning about object affordances in a knowledge base representation. In European conference on computer vision","DOI":"10.1007\/978-3-319-10605-2_27"},{"key":"958_CR69","doi-asserted-by":"crossref","unstructured":"Zitnick, C. L., & Doll\u00e1r, P. (2014). Edge boxes: Locating object proposals from edges. In European conference on computer vision","DOI":"10.1007\/978-3-319-10602-1_26"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-016-0958-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-016-0958-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-016-0958-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,9,14]],"date-time":"2019-09-14T23:42:10Z","timestamp":1568504530000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-016-0958-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,10,28]]},"references-count":69,"journal-issue":{"issue":"2-4","published-print":{"date-parts":[[2018,4]]}},"alternative-id":["958"],"URL":"https:\/\/doi.org\/10.1007\/s11263-016-0958-6","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2016,10,28]]}}}