{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T06:00:06Z","timestamp":1767852006418,"version":"3.49.0"},"reference-count":61,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"7","license":[{"start":{"date-parts":[[2016,7,1]],"date-time":"2016-07-01T00:00:00Z","timestamp":1467331200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Basic Research Program of China","doi-asserted-by":"crossref","award":["2012CB316304"],"award-info":[{"award-number":["2012CB316304"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Strategic Priority Research Program of the CAS","award":["XDB02060009"],"award-info":[{"award-number":["XDB02060009"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61272331"],"award-info":[{"award-number":["61272331"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["91338202"],"award-info":[{"award-number":["91338202"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004826","name":"Beijing Natural Science Foundation","doi-asserted-by":"publisher","award":["4162064"],"award-info":[{"award-number":["4162064"]}],"id":[{"id":"10.13039\/501100004826","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2016,7]]},"DOI":"10.1109\/tmm.2016.2558463","type":"journal-article","created":{"date-parts":[[2016,4,28]],"date-time":"2016-04-28T16:19:09Z","timestamp":1461860349000},"page":"1363-1377","source":"Crossref","is-referenced-by-count":119,"title":["Cross-Modal Retrieval via Deep and Bidirectional Representation Learning"],"prefix":"10.1109","volume":"18","author":[{"given":"Yonghao","family":"He","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2089-9733","authenticated-orcid":false,"given":"Shiming","family":"Xiang","sequence":"additional","affiliation":[]},{"given":"Cuicui","family":"Kang","sequence":"additional","affiliation":[]},{"given":"Jian","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Chunhong","family":"Pan","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"crossref","first-page":"853","DOI":"10.1613\/jair.3994","article-title":"Framing image description as a ranking task: Data, models and evaluation metrics","volume":"47","author":"hodosh","year":"2013","journal-title":"J Artif Intell Res"},{"key":"ref38","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1162\/tacl_a_00166","article-title":"From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions","volume":"2","author":"young","year":"2014","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"ref33","article-title":"Text understanding from scratch","author":"zhang","year":"2015","journal-title":"CoRR"},{"key":"ref32","article-title":"Effective use of word order for text categorization with convolutional neural networks","author":"johnson","year":"2014","journal-title":"CoRR"},{"key":"ref31","first-page":"69","article-title":"Deep convolutional neural networks for sentiment analysis of short texts","author":"santos","year":"0","journal-title":"Proc Int Conf Comput Linguistics"},{"key":"ref30","first-page":"1","article-title":"A convolutional neural network for modelling sentences","author":"blunsom","year":"0","journal-title":"Proc 32nd Ann Meeting Assoc for Computational Linguistics"},{"key":"ref37","first-page":"13","article-title":"The IAPR TC-12 benchmark: A new evaluation resource for visual information systems","author":"grubinger","year":"0","journal-title":"Proc Int Workshop OntoImage"},{"key":"ref36","first-page":"533","article-title":"Learning representations by back-propagating errors","volume":"5","author":"rumelhart","year":"1988","journal-title":"Cognitive Modeling"},{"key":"ref35","first-page":"1367","article-title":"A deep architecture for matching short texts","author":"lu","year":"0","journal-title":"Proc Adv Neural Inform Process Syst"},{"key":"ref34","first-page":"2042","article-title":"Convolutional neural network architectures for matching natural language sentences","author":"hu","year":"0","journal-title":"Proc Adv Neural Inform Process Syst"},{"key":"ref60","article-title":"MatConvNet&#x2013;convolutional neural networks for MATLAB","author":"vedaldi","year":"2014","journal-title":"CoRR"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511809071"},{"key":"ref28","first-page":"1799","article-title":"Joint training of a convolutional network and a graphical model for human pose estimation","author":"tompson","year":"0","journal-title":"Proc Adv Neural Inform Process Syst"},{"key":"ref27","first-page":"1988","article-title":"Deep learning face representation by joint identification-verification","author":"sun","year":"0","journal-title":"Proc Adv Neural Inform Process Syst"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1181"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.261"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1873987"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/11752790_2"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247923"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1162\/089976600300015349"},{"key":"ref24","first-page":"689","article-title":"Multimodal deep learning","author":"ngiam","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref23","first-page":"2222","article-title":"Multimodal learning with deep Boltzmann machines","author":"srivastava","year":"0","journal-title":"Proc Adv Neural Inform Process Syst"},{"key":"ref26","first-page":"1097","article-title":"Imagenet classification with deep convolutional neural networks","author":"krizhevsky","year":"0","journal-title":"Proc Adv Neural Inform Process Syst"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"ref50","article-title":"Show and tell: A neural image caption generator","author":"vinyals","year":"2014","journal-title":"CoRR"},{"key":"ref51","first-page":"129","article-title":"Parsing natural scenes and natural language with recursive neural networks","author":"socher","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref59","first-page":"248","article-title":"ImageNet: A large-scale hierarchical image database","author":"deng","year":"0","journal-title":"Proc Int Conf Comput Vis Pattern Recog"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2014.131"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/2505515.2505665"},{"key":"ref55","article-title":"Efficient estimation of word representations in vector space","author":"mikolov","year":"2013","journal-title":"CoRR"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/s00778-015-0391-4"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref52","first-page":"1017","article-title":"Generating text with recurrent neural networks","author":"sutskever","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2012.6288383"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2007.911822"},{"key":"ref40","first-page":"1889","article-title":"Deep fragment embeddings for bidirectional image sentence mapping","author":"karpathy","year":"0","journal-title":"Proc Advances Neural Inform Process Syst"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/2461466.2461497"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000029664.99615.94"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1023\/A:1011139631724"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/76.927424"},{"key":"ref16","doi-asserted-by":"crossref","DOI":"10.1142\/3838","volume":"100","author":"mukundan","year":"1998","journal-title":"Moment Functions in Image Analysis Theory and Applications"},{"key":"ref17","first-page":"993","article-title":"Latent Dirichlet allocation","volume":"3","author":"blei","year":"2003","journal-title":"J Mach Learn Res"},{"key":"ref18","first-page":"1607","article-title":"Replicated softmax: An undirected topic model","author":"hinton","year":"0","journal-title":"Proc Adv Neural Inform Process Syst"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1162\/0899766042321814"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654902"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/2502081.2502097"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2390499"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.14778\/2732296.2732301"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/2600428.2609563"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-27355-1_30"},{"key":"ref49","doi-asserted-by":"crossref","first-page":"207","DOI":"10.1162\/tacl_a_00177","article-title":"Grounded compositional semantics for finding and describing images with sentences","volume":"2","author":"socher","year":"2014","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/2484028.2484039"},{"key":"ref46","doi-asserted-by":"crossref","first-page":"504","DOI":"10.1126\/science.1127647","article-title":"Reducing the dimensionality of data with neural networks","volume":"313","author":"hinton","year":"2006","journal-title":"Science"},{"key":"ref45","first-page":"448","article-title":"Deep Boltzmann machines","author":"salakhutdinov","year":"0","journal-title":"Proc Int Conf Artif Intell Statist"},{"key":"ref48","article-title":"Long-term recurrent convolutional networks for visual recognition and description","author":"donahue","year":"2014","journal-title":"CoRR"},{"key":"ref47","article-title":"Deep visual-semantic alignments for generating image descriptions","author":"karpathy","year":"2014","journal-title":"CoRR"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995350"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0658-4"},{"key":"ref44","first-page":"2407","article-title":"Learning cross-modality similarity for multinomial data","author":"jia","year":"0","journal-title":"Proc IEEE Int Conf Comput Vis"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/860458.860460"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6046\/7492217\/07460254.pdf?arnumber=7460254","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,12]],"date-time":"2022-01-12T16:11:00Z","timestamp":1642003860000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/7460254\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,7]]},"references-count":61,"journal-issue":{"issue":"7"},"URL":"https:\/\/doi.org\/10.1109\/tmm.2016.2558463","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2016,7]]}}}