{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,27]],"date-time":"2025-10-27T16:14:07Z","timestamp":1761581647876,"version":"3.37.3"},"publisher-location":"Cham","reference-count":25,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319736020"},{"type":"electronic","value":"9783319736037"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-319-73603-7_33","type":"book-chapter","created":{"date-parts":[[2018,1,12]],"date-time":"2018-01-12T09:13:02Z","timestamp":1515748382000},"page":"405-416","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Recursive Pyramid Network with Joint Attention for Cross-Media Retrieval"],"prefix":"10.1007","author":[{"given":"Yuxin","family":"Yuan","sequence":"first","affiliation":[]},{"given":"Yuxin","family":"Peng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,1,13]]},"reference":[{"issue":"8","key":"33_CR1","doi-asserted-by":"crossref","first-page":"1434","DOI":"10.1109\/TMM.2009.2032676","volume":"11","author":"Y Hu","year":"2009","unstructured":"Hu, Y., Cheng, X., Chia, L.T., et al.: Coherent phrase model for efficient image near-duplicate retrieval. IEEE Trans. Multimedia (TMM) 11(8), 1434\u20131445 (2009)","journal-title":"IEEE Trans. Multimedia (TMM)"},{"issue":"5","key":"33_CR2","doi-asserted-by":"crossref","first-page":"612","DOI":"10.1109\/TCSVT.2006.873157","volume":"16","author":"Y Peng","year":"2006","unstructured":"Peng, Y., Ngo, C.W.: Clip-based similarity measure for query-dependent clip retrieval and video summarization. IEEE Trans. Circ. Syst. Video Technol. (TCSVT) 16(5), 612\u2013627 (2006)","journal-title":"IEEE Trans. Circ. Syst. Video Technol. (TCSVT)"},{"key":"33_CR3","doi-asserted-by":"crossref","unstructured":"Peng, Y., Huang, X., Zhao, Y.: An overview of cross-media retrieval: concepts, methodologies, benchmarks and challenges. IEEE Trans. Circ. Syst. Video Technol. (TCSVT) (2017)","DOI":"10.1109\/TCSVT.2017.2705068"},{"issue":"3\/4","key":"33_CR4","doi-asserted-by":"crossref","first-page":"321","DOI":"10.2307\/2333955","volume":"28","author":"H Hotelling","year":"1936","unstructured":"Hotelling, H.: Relations between two sets of variates. Biometrika 28(3\/4), 321\u2013377 (1936)","journal-title":"Biometrika"},{"issue":"6","key":"33_CR5","doi-asserted-by":"crossref","first-page":"965","DOI":"10.1109\/TCSVT.2013.2276704","volume":"24","author":"X Zhai","year":"2014","unstructured":"Zhai, X., Peng, Y., Xiao, J.: Learning cross-media joint representation with sparse and semi-supervised regularization. IEEE Trans. Circ. Syst. Video Technol. (TCSVT) 24(6), 965\u2013978 (2014)","journal-title":"IEEE Trans. Circ. Syst. Video Technol. (TCSVT)"},{"key":"33_CR6","doi-asserted-by":"crossref","unstructured":"Feng, F., Wang, X., Li, R.: Cross-modal retrieval with correspondence autoencoder. In: 22nd ACM International Conference on Multimedia (ACM MM), pp. 7\u201316 (2014)","DOI":"10.1145\/2647868.2654902"},{"key":"33_CR7","unstructured":"Peng, Y., Huang, X., Qi, J.: Cross-media shared representation by hierarchical learning with multiple deep networks. In: International Joint Conference on Artificial Intelligence (IJCAI), pp. 3846\u20133853 (2016)"},{"issue":"2","key":"33_CR8","first-page":"449","volume":"47","author":"Y Wei","year":"2017","unstructured":"Wei, Y., Zhao, Y., Lu, C., et al.: Cross-modal retrieval with CNN visual features: a new baseline. IEEE Trans. Cybern. (TCYB) 47(2), 449\u2013460 (2017)","journal-title":"IEEE Trans. Cybern. (TCYB)"},{"key":"33_CR9","unstructured":"Mnih, V., Heess, N., Graves, A.: Recurrent models of visual attention. In: Advances in Neural Information Processing Systems (NIPS), pp. 2204\u20132212 (2014)"},{"issue":"3","key":"33_CR10","doi-asserted-by":"crossref","first-page":"583","DOI":"10.1109\/TCSVT.2015.2400779","volume":"26","author":"Y Peng","year":"2016","unstructured":"Peng, Y., Zhai, X., Zhao, Y., et al.: Semi-supervised cross-media feature learning with unified patch graph regularization. IEEE Trans. Circ. Syst. Video Technol. 26(3), 583\u2013596 (2016)","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"33_CR11","unstructured":"Peng, Y., Qi, J., Huang, X., et al.: CCL: cross-modal correlation learning with multi-grained fusion by hierarchical network. arXiv preprint arXiv:1704.02116 (2017)"},{"key":"33_CR12","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., et al.: Stacked attention networks for image question answering. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 21\u201329 (2016)","DOI":"10.1109\/CVPR.2016.10"},{"key":"33_CR13","doi-asserted-by":"crossref","unstructured":"Rasiwasia, N., Costa Pereira, J., Coviello, E., et al.: A new approach to cross-modal multimedia retrieval. In: 18th ACM International Conference on Multimedia (ACM MM), pp. 251\u2013260 (2010)","DOI":"10.1145\/1873951.1873987"},{"issue":"2","key":"33_CR14","doi-asserted-by":"crossref","first-page":"210","DOI":"10.1007\/s11263-013-0658-4","volume":"106","author":"Y Gong","year":"2014","unstructured":"Gong, Y., Ke, Q., Isard, M., et al.: A multi-view embedding space for modeling internet images, tags, and their semantics. Int. J. Comput. Vis. (IJCV) 106(2), 210\u2013233 (2014)","journal-title":"Int. J. Comput. Vis. (IJCV)"},{"key":"33_CR15","doi-asserted-by":"crossref","unstructured":"Li, D., Dimitrova, N., Li, M., et al.: Multimedia content processing through cross-modal association. In: 11th ACM International Conference on Multimedia (ACM MM), pp. 604\u2013611 (2003)","DOI":"10.1145\/957013.957143"},{"key":"33_CR16","unstructured":"Rashtchian, C., Young, P., Hodosh, M., et al.: Collecting image annotations using Amazon\u2019s mechanical turk. In: NAACL HLT 2010 Workshop on Creating Speech and Language Data with Amazon\u2019s Mechanical Turk, pp. 139\u2013147 (2010)"},{"key":"33_CR17","doi-asserted-by":"crossref","unstructured":"Yan, F., Mikolajczyk, K.: Deep correlation for matching images and text. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3441\u20133450 (2015)","DOI":"10.1109\/CVPR.2015.7298966"},{"issue":"3","key":"33_CR18","doi-asserted-by":"crossref","first-page":"370","DOI":"10.1109\/TMM.2015.2390499","volume":"17","author":"C Kang","year":"2015","unstructured":"Kang, C., Xiang, S., Liao, S., et al.: Learning consistent feature representation for cross-modal multimedia retrieval. IEEE Trans. Multimedia (TMM) 17(3), 370\u2013381 (2015)","journal-title":"IEEE Trans. Multimedia (TMM)"},{"key":"33_CR19","doi-asserted-by":"crossref","unstructured":"Kim, Y.: Convolutional neural networks for sentence classification. In: Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1746\u20131751 (2014)","DOI":"10.3115\/v1\/D14-1181"},{"key":"33_CR20","unstructured":"Simon, M., Rodner, E., Denzler, J.: Imagenet pre-trained models with batch normalization. arXiv preprint arXiv:1612.01452 (2016)"},{"key":"33_CR21","doi-asserted-by":"crossref","unstructured":"Krause, J., Jin, H., Yang, J., et al.: Fine-grained recognition without part annotations. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5546\u20135555 (2015)","DOI":"10.1109\/CVPR.2015.7299194"},{"key":"33_CR22","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., Dean, J.: Distributed representations of words and phrases and their compositionality. In: Conference on Neural Information Processing Systems (NIPS), pp. 3111\u20133119 (2013)"},{"key":"33_CR23","unstructured":"Kumar, A., Irsoy, O., Ondruska, P., et al.: Ask me anything: dynamic memory networks for natural language processing. In: International Conference on Machine Learning (ICML), pp. 1378\u20131387 (2016)"},{"key":"33_CR24","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. In: International Conference on Learning Representations (ICLR) (2015)"},{"key":"33_CR25","unstructured":"Xu, K., Ba, J., Kiros, R., et al.: Show, attend and tell: neural image caption generation with visual attention. In: International Conference on Machine Learning (ICML), pp. 2048\u20132057 (2015)"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-73603-7_33","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,12]],"date-time":"2022-08-12T10:13:36Z","timestamp":1660299216000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-73603-7_33"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783319736020","9783319736037"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-73603-7_33","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2018]]}}}