{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,9]],"date-time":"2024-09-09T11:37:33Z","timestamp":1725881853420},"publisher-location":"Cham","reference-count":35,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319541891"},{"type":"electronic","value":"9783319541907"}],"license":[{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017]]},"DOI":"10.1007\/978-3-319-54190-7_8","type":"book-chapter","created":{"date-parts":[[2017,3,11]],"date-time":"2017-03-11T05:44:09Z","timestamp":1489211049000},"page":"120-135","source":"Crossref","is-referenced-by-count":0,"title":["Variational Convolutional Networks for Human-Centric Annotations"],"prefix":"10.1007","author":[{"given":"Tsung-Wei","family":"Ke","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Che-Wei","family":"Lin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tyng-Luh","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Davi","family":"Geiger","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,3,12]]},"reference":[{"key":"8_CR1","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"8_CR2","doi-asserted-by":"crossref","unstructured":"Donahue, J., Anne Hendricks, L., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., Darrell, T.: Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2625\u20132634 (2015)","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"8_CR3","unstructured":"Xu, K., Ba, J., Kiros, R., Courville, A., Salakhutdinov, R., Zemel, R., Bengio, Y.: Show, attend and tell: Neural image caption generation with visual attention. arXiv preprint arXiv:1502.03044 (2015)"},{"key":"8_CR4","doi-asserted-by":"crossref","unstructured":"Wu, Q., Shen, C., Liu, L., Dick, A., van den Hengel, A.: What value do explicit high level concepts have in vision to language problems? In: Proceedings IEEE Conference Computer Vision and Pattern Recognition, vol. 2(4) (2016)","DOI":"10.1109\/CVPR.2016.29"},{"key":"8_CR5","doi-asserted-by":"crossref","unstructured":"Yao, L., Torabi, A., Cho, K., Ballas, N., Pal, C., Larochelle, H., Courville, A.: Describing videos by exploiting temporal structure. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4507\u20134515 (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"8_CR6","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., Saenko, K.: Sequence to sequence-video to text. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"8_CR7","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D.: Vqa: Visual question answering. In: International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"8_CR8","unstructured":"Mansimov, E., Parisotto, E., Ba, J., Salakhutdinov, R.: Generating images from captions with attention. In: ICLR (2016)"},{"key":"8_CR9","doi-asserted-by":"crossref","unstructured":"Berg, A.C., Berg, T.L., Daume III., H., Dodge, J., Goyal, A., Han, X., Mensch, A., Mitchell, M., Sood, A., Stratos, K., et al.: Understanding and predicting importance in images. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3562\u20133569. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6248100"},{"key":"8_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). doi: 10.1007\/978-3-319-10602-1_48"},{"key":"8_CR11","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"8_CR12","doi-asserted-by":"crossref","unstructured":"Misra, I., Zitnick, C.L., Mitchell, M., Girshick, R.: Seeing through the human reporting bias: visual classifiers from noisy human-centric labels. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.320"},{"key":"8_CR13","doi-asserted-by":"crossref","unstructured":"Fang, H., Gupta, S., Iandola, F., Srivastava, R.K., Deng, L., Doll\u00e1r, P., Gao, J., He, X., Mitchell, M., Platt, J.C., et al.: From captions to visual concepts and back. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1473\u20131482 (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"8_CR14","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. In: Proceedings of the 2nd International Conference on Learning Representations (ICLR), vol. 2014 (2013)"},{"key":"8_CR15","unstructured":"Devlin, J., Gupta, S., Girshick, R., Mitchell, M., Zitnick, C.L.: Exploring nearest neighbor approaches for image captioning. arXiv preprint arXiv:1505.04467 (2015)"},{"key":"8_CR16","doi-asserted-by":"crossref","unstructured":"Klein, B., Lev, G., Sadeh, G., Wolf, L.: Associating neural word embeddings with deep image representations using fisher vectors. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4437\u20134446 (2015)","DOI":"10.1109\/CVPR.2015.7299073"},{"key":"8_CR17","doi-asserted-by":"crossref","unstructured":"Jia, X., Gavves, E., Fernando, B., Tuytelaars, T.: Guiding long-short term memory for image caption generation. arXiv preprint arXiv:1509.04942 (2015)","DOI":"10.1109\/ICCV.2015.277"},{"key":"8_CR18","doi-asserted-by":"crossref","unstructured":"Rohrbach, M., Qiu, W., Titov, I., Thater, S., Pinkal, M., Schiele, B.: Translating video content to natural language descriptions. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 433\u2013440 (2013)","DOI":"10.1109\/ICCV.2013.61"},{"key":"8_CR19","unstructured":"Thomason, J., Venugopalan, S., Guadarrama, S., Saenko, K., Mooney, R.J.: Integrating language and vision to generate natural language descriptions of videos in the wild. In: COLING, vol. 9 (2014)"},{"key":"8_CR20","doi-asserted-by":"crossref","unstructured":"Turakhia, N., Parikh, D.: Attribute dominance: What pops out? In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1225\u20131232 (2013)","DOI":"10.1109\/ICCV.2013.155"},{"key":"8_CR21","doi-asserted-by":"crossref","unstructured":"Yun, K., Peng, Y., Samaras, D., Zelinsky, G., Berg, T.: Studying relationships between human gaze, description, and computer vision. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 739\u2013746 (2013)","DOI":"10.1109\/CVPR.2013.101"},{"key":"8_CR22","doi-asserted-by":"crossref","unstructured":"Chen, X., Gupta, A.: Webly supervised learning of convolutional networks In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1431\u20131439 (2015)","DOI":"10.1109\/ICCV.2015.168"},{"key":"8_CR23","doi-asserted-by":"crossref","unstructured":"Izadinia, H., Russell, B.C., Farhadi, A., Hoffman, M.D., Hertzmann, A.: Deep classifiers from image tags in the wild. In: Proceedings of the 2015 Workshop on Community-Organized Multimodal Mining: Opportunities for Novel Solutions, pp. 13\u201318. ACM (2015)","DOI":"10.1145\/2814815.2814821"},{"key":"8_CR24","doi-asserted-by":"crossref","unstructured":"Joulin, A., van der Maaten, L., Jabri, A., Vasilache, N.: Learning visual features from large weakly supervised data. arXiv preprint arXiv:1511.02251 (2015)","DOI":"10.1007\/978-3-319-46478-7_5"},{"key":"8_CR25","unstructured":"Kingma, D.P., Mohamed, S., Rezende, D.J., Welling, M.: Semi-supervised learning with deep generative models. In: Advances in Neural Information Processing Systems, pp. 3581\u20133589 (2014)"},{"key":"8_CR26","unstructured":"Burda, Y., Grosse, R., Salakhutdinov, R.: Importance weighted autoencoders. arXiv preprint arXiv:1509.00519 (2015)"},{"key":"8_CR27","unstructured":"Fabius, O., van Amersfoort, J.R.: Variational recurrent auto-encoders. arXiv preprint arXiv:1412.6581 (2014)"},{"key":"8_CR28","unstructured":"Gregor, K., Danihelka, I., Graves, A., Wierstra, D.: Draw: A recurrent neural network for image generation. arXiv preprint arXiv:1502.04623 (2015)"},{"key":"8_CR29","unstructured":"Chung, J., Kastner, K., Dinh, L., Goel, K., Courville, A.C., Bengio, Y.: A recurrent latent variable model for sequential data. In: Advances in Neural Information Processing Systems, pp. 2962\u20132970 (2015)"},{"key":"8_CR30","unstructured":"Kulkarni, T.D., Whitney, W.F., Kohli, P., Tenenbaum, J.: Deep convolutional inverse graphics network. In: Cortes, C., Lawrence, N.D., Lee, D.D., Sugiyama, M., Garnett, R. (eds.): Advances in Neural Information Processing Systems, vol. 28, pp. 2539\u20132547. Curran Associates, Inc., New York (2015)"},{"key":"8_CR31","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: International Conference on Learning Representations (2015)"},{"key":"8_CR32","doi-asserted-by":"crossref","unstructured":"Xie, S., Tu, Z.: Holistically-nested edge detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1395\u20131403 (2015)","DOI":"10.1109\/ICCV.2015.164"},{"key":"8_CR33","unstructured":"Nair, V., Hinton, G.E.: Rectified linear units improve restricted boltzmann machines. In: Proceedings of the 27th International Conference on Machine Learning (ICML 2010), pp. 807\u2013814 (2010)"},{"key":"8_CR34","unstructured":"Chen, X., Fang, H., Lin, T.Y., Vedantam, R., Gupta, S., Doll\u00e1r, P., Zitnick, C.L.: Microsoft COCO captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"8_CR35","unstructured":"Zhang, C., Platt, J.C., Viola, P.A.: Multiple instance boosting for object detection. In: Advances in Neural Information Processing Systems, pp. 1417\u20131424 (2005)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2016"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-54190-7_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,9,19]],"date-time":"2019-09-19T16:31:06Z","timestamp":1568910666000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-54190-7_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017]]},"ISBN":["9783319541891","9783319541907"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-54190-7_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2017]]}}}