{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:43:02Z","timestamp":1772905382166,"version":"3.50.1"},"reference-count":64,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017,7]]},"DOI":"10.1109\/cvpr.2017.116","type":"proceedings-article","created":{"date-parts":[[2017,11,9]],"date-time":"2017-11-09T21:50:33Z","timestamp":1510264233000},"page":"1032-1041","source":"Crossref","is-referenced-by-count":20,"title":["Unsupervised Visual-Linguistic Reference Resolution in Instructional Videos"],"prefix":"10.1109","author":[{"given":"De-An","family":"Huang","sequence":"first","affiliation":[]},{"given":"Joseph J.","family":"Lim","sequence":"additional","affiliation":[]},{"given":"Li","family":"Fei-Fei","sequence":"additional","affiliation":[]},{"given":"Juan Carlos","family":"Niebles","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"crossref","first-page":"405","DOI":"10.1162\/tacl_a_00147","article-title":"Latent structures for coreference resolution","volume":"3","author":"martschat","year":"2015","journal-title":"TACL"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1138"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1155"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.340"},{"key":"ref30","article-title":"Stanford's multi-pass sieve coreference resolution system at the conll-2011 shared task","author":"lee","year":"2011","journal-title":"Proceedings of the Fifteenth Conference on Computational Natural Language Learning Shared Task"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-5010"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-2407"},{"key":"ref35","article-title":"Whats cookin? interpreting cooking videos using text, speech and vision","author":"malmaud","year":"2015","journal-title":"HLT-NAACL"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W15-2206"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1019"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654997"},{"key":"ref61","article-title":"Modeling context in referring expressions","author":"yu","year":"2016","journal-title":"ECCV"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.11"},{"key":"ref28","doi-asserted-by":"crossref","first-page":"193","DOI":"10.1162\/tacl_a_00220","article-title":"Jointly learning to parse and perceive: Connecting natural language to the physical world","volume":"1","author":"krishnamurthy","year":"2013","journal-title":"TACL"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.387"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.455"},{"key":"ref29","article-title":"Interpreting written how-to instructions","author":"lau","year":"2009","journal-title":"IJCAI"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.495"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1091"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref24","article-title":"Unifying visual-semantic embeddings with multimodal neural language models","author":"kiros","year":"2015","journal-title":"TACL"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1114"},{"key":"ref26","article-title":"Toward understanding natural language directions","author":"kollar","year":"2010","journal-title":"ACM\/IEEE International Conference on Human-Robot Interaction (HRI)"},{"key":"ref25","first-page":"3294","article-title":"Skip-thought vectors","author":"kiros","year":"2015","journal-title":"Advances in neural information processing systems"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.3115\/1708376.1708381"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W15-2812"},{"key":"ref59","doi-asserted-by":"crossref","DOI":"10.1609\/aaai.v29i1.9512","article-title":"Jointly modeling deep video and compositional text to bridge vision and language in a unified framework","author":"xu","year":"2015","journal-title":"AAAI"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2016.7487364"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-013-5383-2"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.530"},{"key":"ref54","doi-asserted-by":"crossref","first-page":"207","DOI":"10.1162\/tacl_a_00177","article-title":"Grounded compositional semantics for finding and describing images with sentences","volume":"2","author":"socher","year":"2014","journal-title":"TACL"},{"key":"ref53","article-title":"Learning visual storylines with skipping recurrent neural networks","author":"sigurdsson","year":"2016","journal-title":"ECCV"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.509"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.516"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459279"},{"key":"ref40","article-title":"Distributed representations of words and phrases and their compositionality","author":"mikolov","year":"2013","journal-title":"NIPS"},{"key":"ref12","article-title":"Easy victories and uphill battles in coreference resolution","author":"durrett","year":"2013","journal-title":"EMNLP"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.260"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206492"},{"key":"ref15","article-title":"Cross-caption coreference resolution for automatic image understanding","author":"hodosh","year":"2010","journal-title":"Proceedings of the Fourteenth Conference on Computational Natural Language Learning"},{"key":"ref16","author":"hu","year":"2016","journal-title":"Modeling relationships in referential expressions with compositional modular networks"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1147"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1090"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.494"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1138"},{"key":"ref3","author":"alayrac","year":"2017","journal-title":"Joint discovery of object states and manipulating actions"},{"key":"ref6","article-title":"Learning structured perceptrons for coreference resolution with latent antecedents and nonlocal features","author":"bj\u00f6rkelund","year":"2014","journal-title":"ACL"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2004.1315253"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.507"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.283"},{"key":"ref49","article-title":"Grounding of textual phrases in images by reconstruction","author":"rohrbach","year":"2016","journal-title":"ECCV"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.340"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553482"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.117"},{"key":"ref47","article-title":"Linking people in videos with their names using coreference resolution","author":"ramanathan","year":"2014","journal-title":"ECCV"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/N15-1017"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.85"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"}],"event":{"name":"2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Honolulu, HI","start":{"date-parts":[[2017,7,21]]},"end":{"date-parts":[[2017,7,26]]}},"container-title":["2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8097368\/8099483\/08099599.pdf?arnumber=8099599","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,6]],"date-time":"2022-08-06T23:24:18Z","timestamp":1659828258000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/8099599\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,7]]},"references-count":64,"URL":"https:\/\/doi.org\/10.1109\/cvpr.2017.116","relation":{},"subject":[],"published":{"date-parts":[[2017,7]]}}}