{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T14:23:36Z","timestamp":1762352616437,"version":"3.37.3"},"reference-count":34,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2017,11,16]],"date-time":"2017-11-16T00:00:00Z","timestamp":1510790400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2017,11,16]],"date-time":"2017-11-16T00:00:00Z","timestamp":1510790400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/100000185","name":"Defense Advanced Research Projects Agency","doi-asserted-by":"publisher","award":["FA8750-16-2-0204"],"award-info":[{"award-number":["FA8750-16-2-0204"]}],"id":[{"id":"10.13039\/100000185","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006602","name":"Air Force Research Laboratory","doi-asserted-by":"crossref","award":["FA8750-16-2-0204"],"award-info":[{"award-number":["FA8750-16-2-0204"]}],"id":[{"id":"10.13039\/100006602","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2018,3]]},"DOI":"10.1007\/s13735-017-0139-6","type":"journal-article","created":{"date-parts":[[2017,11,16]],"date-time":"2017-11-16T14:54:58Z","timestamp":1510844098000},"page":"17-28","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["MSRC: multimodal spatial regression with semantic context for phrase grounding"],"prefix":"10.1007","volume":"7","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1415-5495","authenticated-orcid":false,"given":"Kan","family":"Chen","sequence":"first","affiliation":[]},{"given":"Rama","family":"Kovvuri","sequence":"additional","affiliation":[]},{"given":"Jiyang","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Ram","family":"Nevatia","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,11,16]]},"reference":[{"key":"139_CR1","unstructured":"Andrej K, Li FF (2015) Deep visual-semantic alignments for generating image descriptions. In: CVPR"},{"key":"139_CR2","doi-asserted-by":"crossref","unstructured":"Antol S, Agrawal A, Lu J, Mitchell M, Batra D, Lawrence ZC, Parikh D (2015) Vqa: visual question answering. In: ICCV","DOI":"10.1109\/ICCV.2015.279"},{"key":"139_CR3","unstructured":"Chen K, Wang J, Chen LC, Gao H, Xu W, Nevatia R (2016) ABC-CNN: an attention based convolutional neural network for visual question answering. In: CVPR Workshop"},{"key":"139_CR4","doi-asserted-by":"crossref","unstructured":"Chen K, Bui T, Fang C, Wang Z, Nevatia R (2017) AMC: attention guided multi-modal correlation learning for image search. In: CVPR","DOI":"10.1109\/CVPR.2017.657"},{"key":"139_CR5","doi-asserted-by":"crossref","unstructured":"Chen K, Kovvuri R, Nevatia R (2017) Query-guided regression network with context policy for phrase grounding. In: ICCV","DOI":"10.1109\/ICCV.2017.95"},{"key":"139_CR6","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li LJ, Li K, Li FF (2009) Imagenet: a large-scale hierarchical image database. In: CVPR","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"139_CR7","doi-asserted-by":"crossref","unstructured":"Everingham M, Van\u00a0Gool L, Williams CKI, Winn J, Zisserman A (2010) The PASCAL Visual Object Classes Challenge. In: IJCV","DOI":"10.1007\/s11263-009-0275-4"},{"key":"139_CR8","doi-asserted-by":"crossref","unstructured":"Fang H, Gupta S, Iandola F, Srivastava RK, Deng L, Doll\u00e1r P, Gao J, He X, Mitchell M, Platt JC, et\u00a0al (2015) From captions to visual concepts and back. In: CVPR","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"139_CR9","doi-asserted-by":"crossref","unstructured":"Fukui A, Park DH, Yang D, Rohrbach A, Darrell T, Rohrbach M (2016) Multimodal compact bilinear pooling for visual question answering and visual grounding. In: EMNLP","DOI":"10.18653\/v1\/D16-1044"},{"key":"139_CR10","doi-asserted-by":"crossref","unstructured":"Girshick R (2015) Fast r-cnn. In: ICCV","DOI":"10.1109\/ICCV.2015.169"},{"key":"139_CR11","unstructured":"Glorot X, Bengio Y (2010) Understanding the difficulty of training deep feedforward neural networks. In: Aistats"},{"key":"139_CR12","doi-asserted-by":"crossref","unstructured":"Gordo A, Almaz\u00e1n J, Revaud J, Larlus D (2016) Deep image retrieval: learning global representations for image search. In: ECCV","DOI":"10.1007\/978-3-319-46466-4_15"},{"key":"139_CR13","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2015) Delving deep into rectifiers: Surpassing human-level performance on imagenet classification. In: CVPR","DOI":"10.1109\/ICCV.2015.123"},{"key":"139_CR14","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9:1735\u20131780","journal-title":"Neural Comput"},{"key":"139_CR15","doi-asserted-by":"crossref","unstructured":"Hu R, Xu H, Rohrbach M, Feng J, Saenko K, Darrell T (2016) Natural language object retrieval. In: CVPR","DOI":"10.1109\/CVPR.2016.493"},{"key":"139_CR16","unstructured":"Justin J, Andrej K, Li FF (2016) Densecap: fully convolutional localization networks for dense captioning. In: CVPR"},{"key":"139_CR17","doi-asserted-by":"crossref","unstructured":"Kazemzadeh S, Ordonez V, Matten M, Berg TL (2014) Referit game: referring to objects in photographs of natural scenes. In: EMNLP","DOI":"10.3115\/v1\/D14-1086"},{"key":"139_CR18","doi-asserted-by":"crossref","unstructured":"Kantorov V, Oquab M, Cho M, Laptev I (2016) Contextlocnet: context-aware deep network models for weakly supervised localization. In: ECCV","DOI":"10.1007\/978-3-319-46454-1_22"},{"key":"139_CR19","unstructured":"Karpathy A, Joulin A, Li FF (2014) Deep fragment embeddings for bidirectional image sentence mapping. In: NIPS"},{"key":"139_CR20","unstructured":"Kingma D, Ba J (2015) Adam: a method for stochastic optimization. In: ICLR"},{"key":"139_CR21","doi-asserted-by":"crossref","unstructured":"Krishnamurthy J, Kollar T (2013) Jointly learning to parse and perceive: Connecting natural language to the physical world. In: TACL","DOI":"10.1162\/tacl_a_00220"},{"key":"139_CR22","doi-asserted-by":"crossref","unstructured":"Liu W, Anguelov D, Erhan D, Szegedy C, Reed S, Fu CY, Berg AC (2016) SSD: single shot multibox detector. In: ECCV","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"139_CR23","unstructured":"Matuszek C, FitzGerald N, Zettlemoyer L, Bo L, Fox D (2012) A joint model of language and perception for grounded attribute learning. In: ICML"},{"key":"139_CR24","doi-asserted-by":"crossref","unstructured":"Nagaraja VK, Morariu VI, Davis LS (2016) Modeling context between objects for referring expression understanding. In: ECCV","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"139_CR25","doi-asserted-by":"crossref","unstructured":"Plummer BA, Wang L, Cervantes CM, Caicedo JC, Hockenmaier J, Lazebnik S (2016) Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: IJCV","DOI":"10.1109\/ICCV.2015.303"},{"key":"139_CR26","doi-asserted-by":"crossref","unstructured":"Radenovi\u0107 F, Tolias G, Chum O (2016) CNN image retrieval learns from bow: unsupervised fine-tuning with hard examples. In: ECCV","DOI":"10.1007\/978-3-319-46448-0_1"},{"key":"139_CR27","doi-asserted-by":"crossref","unstructured":"Redmon J, Divvala S, Girshick R, Farhadi A (2016) You only look once: unified real-time object detection. In: CVPR","DOI":"10.1109\/CVPR.2016.91"},{"key":"139_CR28","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster r-cnn: towards real-time object detection with region proposal networks. In: NIPS"},{"key":"139_CR29","doi-asserted-by":"crossref","unstructured":"Rohrbach A, Rohrbach M, Hu R, Darrell T, Schiele B (2016) Grounding of textual phrases in images by reconstruction. In: ECCV","DOI":"10.1007\/978-3-319-46448-0_49"},{"key":"139_CR30","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. In: CoRR"},{"key":"139_CR31","doi-asserted-by":"crossref","unstructured":"Uijlings JR, Van De Sande KE, Gevers T, Smeulders AW (2013) Selective search for object recognition. In: IJCV","DOI":"10.1007\/s11263-013-0620-5"},{"key":"139_CR32","doi-asserted-by":"crossref","unstructured":"Wang M, Azab M, Kojima N, Mihalcea R, Deng J (2016) Structured matching for phrase localization. In: ECCV","DOI":"10.1007\/978-3-319-46484-8_42"},{"key":"139_CR33","doi-asserted-by":"crossref","unstructured":"Yu L, Poirson P, Yang S, Berg AC, Berg TL (2016) Modeling context in referring expressions. In: ECCV","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"139_CR34","doi-asserted-by":"crossref","unstructured":"Zitnick CL, Doll\u00e1r P (2014) Edge boxes: locating object proposals from edges. In: ECCV","DOI":"10.1007\/978-3-319-10602-1_26"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s13735-017-0139-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-017-0139-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-017-0139-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,5,14]],"date-time":"2020-05-14T07:55:21Z","timestamp":1589442921000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s13735-017-0139-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,11,16]]},"references-count":34,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2018,3]]}},"alternative-id":["139"],"URL":"https:\/\/doi.org\/10.1007\/s13735-017-0139-6","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"type":"print","value":"2192-6611"},{"type":"electronic","value":"2192-662X"}],"subject":[],"published":{"date-parts":[[2017,11,16]]},"assertion":[{"value":"17 August 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 October 2017","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 November 2017","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 November 2017","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}