{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T00:55:11Z","timestamp":1773708911810,"version":"3.50.1"},"reference-count":53,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"11","license":[{"start":{"date-parts":[[2018,11,1]],"date-time":"2018-11-01T00:00:00Z","timestamp":1541030400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61771025"],"award-info":[{"award-number":["61771025"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61532005"],"award-info":[{"award-number":["61532005"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. on Image Process."],"published-print":{"date-parts":[[2018,11]]},"DOI":"10.1109\/tip.2018.2852503","type":"journal-article","created":{"date-parts":[[2018,7,2]],"date-time":"2018-07-02T18:42:09Z","timestamp":1530556929000},"page":"5585-5599","source":"Crossref","is-referenced-by-count":143,"title":["Modality-Specific Cross-Modal Similarity Measurement With Recurrent Attention Network"],"prefix":"10.1109","volume":"27","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8398-3513","authenticated-orcid":false,"given":"Yuxin","family":"Peng","sequence":"first","affiliation":[]},{"given":"Jinwei","family":"Qi","sequence":"additional","affiliation":[]},{"given":"Yuxin","family":"Yuan","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1044"},{"key":"ref38","first-page":"1693","article-title":"Teaching machines to read and comprehend","author":"hermann","year":"2015","journal-title":"Proc Conf Neural Inf Process Syst"},{"key":"ref33","author":"peng","year":"2017","journal-title":"CM-GANs Cross-modal generative adversarial networks for common representation learning"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2016.2558463"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2016.2519449"},{"key":"ref30","first-page":"1247","article-title":"Deep canonical correlation analysis","author":"andrew","year":"2013","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref37","first-page":"1","article-title":"Reasoning about entailment with neural attention","author":"rockt\u00e4schel","year":"2016","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"ref35","first-page":"1","article-title":"DRAW: A recurrent neural network for image generation","author":"gregor","year":"2015","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref34","first-page":"2204","article-title":"Recurrent models of visual attention","author":"mnih","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref28","first-page":"1","article-title":"Multimodal deep learning","author":"ngiam","year":"2011","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref29","first-page":"1","article-title":"Learning representations for multimodal data with deep belief nets","author":"srivastava","year":"2012","journal-title":"Proc Int Machine Learning Workshop"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2006.873157"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2421443"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.466"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2004.1326716"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/957142.957143"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2015.2400779"},{"key":"ref23","first-page":"1198","article-title":"Heterogeneous metric learning with joint graph regularization for cross-media retrieval","author":"zhai","year":"2013","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964328"},{"key":"ref25","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc Conf Neural Inf Process Syst"},{"key":"ref50","first-page":"1","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1162\/0899766042321814"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2742704"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2390499"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2505311"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2403240"},{"key":"ref40","first-page":"1","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2507401"},{"key":"ref13","first-page":"3846","article-title":"Cross-media shared representation by hierarchical learning with multiple deep networks","author":"peng","year":"2016","journal-title":"Proc Int Joint Conf Artif Intell"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654902"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298966"},{"key":"ref16","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref17","first-page":"289","article-title":"Hierarchical question-image co-attention for visual question answering","author":"lu","year":"2016","journal-title":"Proc Conf Neural Inf Process Syst"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1093\/biomet\/28.3-4.321"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.142"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1873987"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2017.2705068"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2592800"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2676345"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0658-4"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2466106"},{"key":"ref49","author":"hinton","year":"2012","journal-title":"Improving Neural Networks by Preventing Co-adaptation of Feature Detectors"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2013.2276704"},{"key":"ref46","first-page":"139","article-title":"Collecting image annotations using Amazon&#x2019;s mechanical turk","author":"rashtchian","year":"2010","journal-title":"NAACL HLT Workshop on Creating Speech and Language Data with Amazon's Mechanical Turk"},{"key":"ref45","author":"kong","year":"2016","journal-title":"Coarse2Fine Two-layer fusion for image retrieval"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299073"},{"key":"ref47","first-page":"740","article-title":"ar, and C. L. Zitnick, &#x201C;Microsoft COCO: Common objects in context","author":"lin","year":"2014","journal-title":"Proc Eur Conf Comput Vis (ECCV)"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref41","author":"lin","year":"2017","journal-title":"Task-driven visual saliency and attention-based visual question answering"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1181"},{"key":"ref43","first-page":"3111","article-title":"Distributed representations of words and phrases and their compositionality","author":"mikolov","year":"2013","journal-title":"Proc Adv Neural Inf Process Syst"}],"container-title":["IEEE Transactions on Image Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/83\/8421670\/08401908.pdf?arnumber=8401908","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,12]],"date-time":"2022-01-12T16:29:26Z","timestamp":1642004966000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8401908\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,11]]},"references-count":53,"journal-issue":{"issue":"11"},"URL":"https:\/\/doi.org\/10.1109\/tip.2018.2852503","relation":{},"ISSN":["1057-7149","1941-0042"],"issn-type":[{"value":"1057-7149","type":"print"},{"value":"1941-0042","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,11]]}}}