{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T00:50:20Z","timestamp":1740099020280,"version":"3.37.3"},"publisher-location":"Cham","reference-count":26,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319736020"},{"type":"electronic","value":"9783319736037"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-319-73603-7_13","type":"book-chapter","created":{"date-parts":[[2018,1,12]],"date-time":"2018-01-12T09:13:02Z","timestamp":1515748382000},"page":"154-165","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Deep Convolutional Neural Network for\u00a0Correlating Images and Sentences"],"prefix":"10.1007","author":[{"given":"Yuhua","family":"Jia","sequence":"first","affiliation":[]},{"given":"Liang","family":"Bai","sequence":"additional","affiliation":[]},{"given":"Peng","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Jinlin","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Yuxiang","family":"Xie","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,1,13]]},"reference":[{"key":"13_CR1","unstructured":"Socher, R., Karpathy, A., Le, Q.V., et al.: Grounded compositional semantics for finding and describing images with sentences (2013). Nlp.stanford.edu"},{"key":"13_CR2","doi-asserted-by":"crossref","unstructured":"Eisenschtat, A., Wolf, L.: Linking Image and Text with 2-Way Nets (2017)","DOI":"10.1109\/CVPR.2017.201"},{"key":"13_CR3","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., et al.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: Computer Vision and Pattern Recognition, pp. 580\u2013587. IEEE (2013)","DOI":"10.1109\/CVPR.2014.81"},{"key":"13_CR4","unstructured":"Karpathy, A., Joulin, A., Li, F.F.: Deep fragment embeddings for bidirectional image sentence mapping. In: Advances in Neural Information Processing Systems, vol. 3, pp. 1889\u20131897 (2014)"},{"issue":"1","key":"13_CR5","doi-asserted-by":"crossref","first-page":"74","DOI":"10.1007\/s11263-016-0965-7","volume":"123","author":"BA Plummer","year":"2017","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., et al.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. Int. J. Comput. Vis. 123(1), 74\u201393 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"13_CR6","doi-asserted-by":"crossref","unstructured":"Ma, L., Lu, Z., Shang, L., et al.: Multimodal convolutional neural networks for matching image and sentence. In: IEEE International Conference on Computer Vision, pp. 2623\u20132631 (2015)","DOI":"10.1109\/ICCV.2015.301"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Huang, Y., Wang, W., Wang, L.: Instance-aware Image and Sentence Matching with Selective Multimodal LSTM (2016)","DOI":"10.1109\/CVPR.2017.767"},{"key":"13_CR8","unstructured":"Mikolov, T., Chen, K., Corrado, G., et al.: Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781 (2013)"},{"key":"13_CR9","doi-asserted-by":"crossref","unstructured":"Kalchbrenner, N., Grefenstette, E., Blunsom, P.: A convolutional neural network for modelling sentences. Eprint Arxiv, p. 1 (2014)","DOI":"10.3115\/v1\/P14-1062"},{"key":"13_CR10","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., et al.: Show and tell: a neural image caption generator. Comput. Sci. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"13_CR11","doi-asserted-by":"crossref","unstructured":"Ghosh, S., Das, N., Goncalves, T., et al.: Representing image captions as concept graphs using semantic information. In: International Conference on Advances in Computing, Communications and Informatics, pp. 162\u2013167 (2016)","DOI":"10.1109\/ICACCI.2016.7732041"},{"key":"13_CR12","doi-asserted-by":"crossref","unstructured":"Fang, H., Gupta, S., Iandola, F., et al.: From captions to visual concepts and back. Eprint Arxiv, pp. 1473\u20131482 (2014)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"13_CR13","doi-asserted-by":"crossref","unstructured":"Wang, C., Yang, H., Bartz, C., et al.: Image captioning with deep bidirectional LSTMs. In: ACM on Multimedia Conference, pp. 988\u2013997 (2016)","DOI":"10.1145\/2964284.2964299"},{"key":"13_CR14","unstructured":"Dong, J., Li, X., Snoek, C.G.M.: Word2VisualVec: image and video to sentence matching by visual feature prediction. Arxiv (2016)"},{"key":"13_CR15","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"13_CR16","doi-asserted-by":"crossref","unstructured":"Caruana, R., Lawrence, S., Giles, L.: Overfitting in neural nets: backpropagation, conjugate gradient, and early stopping. In: International Conference on Neural Information Processing Systems, pp. 381\u2013387. MIT Press (2000)","DOI":"10.1109\/IJCNN.2000.857823"},{"issue":"4","key":"13_CR17","first-page":"212","volume":"3","author":"GE Hinton","year":"2012","unstructured":"Hinton, G.E., Srivastava, N., Krizhevsky, A., et al.: Improving neural networks by preventing co-adaptation of feature detectors. Comput. Sci. 3(4), 212\u2013223 (2012)","journal-title":"Comput. Sci."},{"issue":"1","key":"13_CR18","doi-asserted-by":"crossref","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh, M., Young, P., Hockenmaier, J.: Framing image description as a ranking task: data, models and evaluation metrics. J. Artif. Intell. Res. 47(1), 853\u2013899 (2013)","journal-title":"J. Artif. Intell. Res."},{"key":"13_CR19","unstructured":"Young, P., Lai, A., Hodosh, M., et al.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions (2014). Nlp.cs.illinois.edu"},{"key":"13_CR20","unstructured":"Mao, J., Xu, W., Yang, Y., et al.: Explain images with multimodal recurrent neural networks. arXiv preprint arXiv:1410.1090 (2014)"},{"key":"13_CR21","doi-asserted-by":"crossref","unstructured":"Yan, F., Mikolajczyk, K.: Deep correlation for matching images and text. In: Computer Vision and Pattern Recognition, pp. 3441\u20133450. IEEE (2015)","DOI":"10.1109\/CVPR.2015.7298966"},{"key":"13_CR22","unstructured":"Klein, B., Lev, G., Sadeh, G., et al.: Fisher vectors derived from hybrid gaussian-laplacian mixture models for image annotation. Eprint Arxiv (2014)"},{"key":"13_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"328","DOI":"10.1007\/978-3-319-48890-5_32","volume-title":"Advances in Multimedia Information Processing \u2013 PCM 2016","author":"T Yu","year":"2016","unstructured":"Yu, T., Bai, L., Guo, J., Yang, Z., Xie, Y.: A deep two-stream network for bidirectional cross-media information retrieval. In: Chen, E., Gong, Y., Tie, Y. (eds.) PCM 2016. LNCS, vol. 9916, pp. 328\u2013337. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-48890-5_32"},{"issue":"3","key":"13_CR24","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., et al.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"13_CR25","unstructured":"Mnih, V., Heess, N., et al.: Recurrent models of visual attention. In: International Conference on Neural Information Processing Systems, pp. 2204\u20132212. MIT Press (2014)"},{"key":"13_CR26","unstructured":"Hu, B., Lu, Z., Li, H., Chen, Q.: Convolutional neural network architectures for matching natural language sentences. In: NIPS 2014 (2014)"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-73603-7_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,12]],"date-time":"2022-08-12T10:10:29Z","timestamp":1660299029000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-73603-7_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783319736020","9783319736037"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-73603-7_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2018]]}}}