{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T03:38:01Z","timestamp":1776310681409,"version":"3.50.1"},"reference-count":50,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2018,10,1]],"date-time":"2018-10-01T00:00:00Z","timestamp":1538352000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2018,10,1]],"date-time":"2018-10-01T00:00:00Z","timestamp":1538352000000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2018,10,1]],"date-time":"2018-10-01T00:00:00Z","timestamp":1538352000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2018,10,1]],"date-time":"2018-10-01T00:00:00Z","timestamp":1538352000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"US National Science Foundation","award":["IIS-1524817"],"award-info":[{"award-number":["IIS-1524817"]}]},{"name":"Google faculty research award"},{"name":"Google PhD fellowship to C.V. Yusuf Aytar"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2018,10,1]]},"DOI":"10.1109\/tpami.2017.2753232","type":"journal-article","created":{"date-parts":[[2017,9,18]],"date-time":"2017-09-18T18:08:33Z","timestamp":1505758113000},"page":"2303-2314","source":"Crossref","is-referenced-by-count":81,"title":["Cross-Modal Scene Networks"],"prefix":"10.1109","volume":"40","author":[{"given":"Yusuf","family":"Aytar","sequence":"first","affiliation":[]},{"given":"Lluis","family":"Castrejon","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5676-2387","authenticated-orcid":false,"given":"Carl","family":"Vondrick","sequence":"additional","affiliation":[]},{"given":"Hamed","family":"Pirsiavash","sequence":"additional","affiliation":[]},{"given":"Antonio","family":"Torralba","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","first-page":"935","article-title":"Zero-shot learning through\n cross-modal transfer","author":"socher","year":"2013","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref38","first-page":"213","article-title":"Adapting visual category models to new\n domains","author":"saenko","year":"2010","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref33","first-page":"2405","article-title":"Visually indicated sounds","author":"owens","year":"2015","journal-title":"Proc IEEE Conf Comput Vis Pattern Recog"},{"key":"ref32","article-title":"Zero-shot learning by convex\n combination of semantic embeddings","author":"norouzi","year":"2013","journal-title":"arXiv preprint arXiv 1312 5650"},{"key":"ref31","first-page":"689","article-title":"Multimodal deep learning","author":"ngiam","year":"2011","journal-title":"Proc 28th Int Conf Mach Learn"},{"key":"ref30","first-page":"1401","article-title":"Contextual models for object detection using boosted random fields","author":"murphy","year":"2004","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref37","first-page":"823","article-title":"Cluster canonical correlation analysis","author":"rasiwasia","year":"2014","journal-title":"Proc Int Conf Artif Intell Statist"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1873987"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.466"},{"key":"ref34","first-page":"1410","article-title":"Zero-shot learning with\n semantic output codes","author":"palatucci","year":"2009","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref28","first-page":"97","article-title":"Learning transferable features with deep\n adaptation networks","author":"long","year":"2015","journal-title":"Int Conf Mach Learn"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206594"},{"key":"ref29","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"van der maaten","year":"2008","journal-title":"J Mach Learn Res"},{"key":"ref2","doi-asserted-by":"crossref","first-page":"114","DOI":"10.1016\/j.cviu.2015.04.004","article-title":"Part level transfer regularization for enhancing exemplar SVMs","author":"aytar","year":"2015","journal-title":"Proc Comput Vis Image Understanding"},{"key":"ref1","first-page":"892","article-title":"SoundNet:\n Learning sound representations from unlabeled video","author":"aytar","year":"2016","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2005.107"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"ref21","first-page":"2407","article-title":"Learning cross-modality\n similarity for multinomial data","author":"jia","year":"2011","journal-title":"Proc IEEE Int Conf Comput Vis"},{"key":"ref24","article-title":"Unifying visual-semantic embeddings with multimodal neural language models","author":"kiros","year":"2014","journal-title":"arXiv preprint arXiv 1411 2539"},{"key":"ref23","first-page":"158","article-title":"Undoing the damage of dataset bias","author":"khosla","year":"2012","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref26","first-page":"1097","article-title":"ImageNet\n classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref25","first-page":"3294","article-title":"Skip-thought vectors","author":"kiros","year":"2015","journal-title":"Advances Neural Inf Process Syst"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.211"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.321"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2006.79"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995347"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.368"},{"key":"ref13","first-page":"449","article-title":"Object classification from a single example utilizing class\n relevance metrics","author":"fink","year":"2005","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref14","first-page":"2027","article-title":"Predicting object dynamics in scenes","author":"fouhey","year":"2014","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref15","first-page":"2121","article-title":"DeViSE: A deep visual-semantic embedding\n model","author":"frome","year":"2013","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref16","first-page":"354","article-title":"What makes a good\n detector?&#x2013;Structured priors for learning from few examples","author":"gao","year":"2012","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126344"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1162\/0899766042321814"},{"key":"ref4","first-page":"672","article-title":"Cross-generalization: Learning novel classes from a single example by feature replacement","author":"bart","year":"2005","journal-title":"Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit"},{"key":"ref3","first-page":"4247","article-title":"Predicting deep zero-shot convolutional\n neural networks using textual descriptions","author":"ba","year":"2015","journal-title":"Proc IEEE Int Conf Comput Vision"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1142\/S0218001493000339"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553380"},{"key":"ref8","article-title":"Inverting convolutional networks with convolutional networks","author":"dosovitskiy","year":"2015","journal-title":"CoRR abs\/1506 02753"},{"key":"ref7","first-page":"647","article-title":"DeCAF: A\n deep convolutional activation feature for generic visual recognition","author":"donahue","year":"2014","journal-title":"Int Conf Mach Learn"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.387"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2010.266"},{"key":"ref46","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual\n attention","author":"xu","year":"2015","journal-title":"Int Conf Mach Learn"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"ref48","first-page":"487","article-title":"Learning deep features for scene recognition using places database","author":"zhou","year":"2014","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref47","article-title":"Object detectors emerge in deep scene CNNs","author":"zhou","year":"2014","journal-title":"arXiv preprint arXiv 1412 6856"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.8"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.463"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298797"},{"key":"ref43","first-page":"289","article-title":"Learning visual biases from human\n imagination","author":"vondrick","year":"2015","journal-title":"Proc Advances Neural Inf Process Syst"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielaam\/34\/8454009\/8039215-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/8454009\/08039215.pdf?arnumber=8039215","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,8]],"date-time":"2022-04-08T18:47:36Z","timestamp":1649443656000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8039215\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,10,1]]},"references-count":50,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2017.2753232","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,10,1]]}}}