{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T19:11:12Z","timestamp":1775243472365,"version":"3.50.1"},"publisher-location":"Cham","reference-count":55,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012458","type":"print"},{"value":"9783030012465","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01246-5_31","type":"book-chapter","created":{"date-parts":[[2018,10,5]],"date-time":"2018-10-05T16:14:56Z","timestamp":1538756096000},"page":"519-535","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":46,"title":["Unpaired Image Captioning by Language Pivoting"],"prefix":"10.1007","author":[{"given":"Jiuxiang","family":"Gu","sequence":"first","affiliation":[]},{"given":"Shafiq","family":"Joty","sequence":"additional","affiliation":[]},{"given":"Jianfei","family":"Cai","sequence":"additional","affiliation":[]},{"given":"Gang","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"key":"31_CR1","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. In: ICLR (2015)"},{"key":"31_CR2","unstructured":"Bengio, S., Vinyals, O., Jaitly, N., Shazeer, N.: Scheduled sampling for sequence prediction with recurrent neural networks. In: NIPS, pp. 1171\u20131179 (2015)"},{"key":"31_CR3","unstructured":"Bertoldi, N., Barbaiani, M., Federico, M., Cattoni, R.: Phrase-based statistical machine translation with pivot languages. In: IWSLT, pp. 143\u2013149 (2008)"},{"key":"31_CR4","unstructured":"Cer, D., Manning, C.D., Jurafsky, D.: The best lexical metric for phrase-based statistical mt system optimization. In: NAACL, pp. 555\u2013563 (2010)"},{"key":"31_CR5","doi-asserted-by":"crossref","unstructured":"Chen, T.H., Liao, Y.H., Chuang, C.Y., Hsu, W.T., Fu, J., Sun, M.: Show, adapt and tell: adversarial training of cross-domain image captioner. In: ICCV, pp. 521\u2013530 (2017)","DOI":"10.1109\/ICCV.2017.64"},{"key":"31_CR6","doi-asserted-by":"crossref","unstructured":"Chen, Y., Liu, Y., Li, V.O.: Zero-resource neural machine translation with multi-agent communication game. In: AAAI, pp. 5086\u20135093 (2018)","DOI":"10.1609\/aaai.v32i1.11976"},{"key":"31_CR7","doi-asserted-by":"crossref","unstructured":"Cheng, Y., et al.: Semi-supervised learning for neural machine translation. In: ACL, pp. 1965\u20131974 (2016)","DOI":"10.18653\/v1\/P16-1185"},{"key":"31_CR8","doi-asserted-by":"crossref","unstructured":"Cheng, Y., Yang, Q., Liu, Y., Sun, M., Xu, W.: Joint training for pivot-based neural machine translation. In: IJCAI, pp. 3974\u20133980 (2017)","DOI":"10.24963\/ijcai.2017\/555"},{"key":"31_CR9","doi-asserted-by":"crossref","unstructured":"Cho, K., et al.: Learning phrase representations using rnn encoder-decoder for statistical machine translation, pp. 1724\u20131734 (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"31_CR10","doi-asserted-by":"crossref","unstructured":"Cohn, T., Hoang, C.D.V., Vymolova, E., Yao, K., Dyer, C., Haffari, G.: Incorporating structural alignment biases into an attentional neural translation model. In: ACL, pp. 876\u2013885 (2016)","DOI":"10.18653\/v1\/N16-1102"},{"key":"31_CR11","unstructured":"Cohn, T., Lapata, M.: Machine translation by triangulation: making effective use of multi-parallel corpora. In: ACL, pp. 728\u2013735 (2007)"},{"key":"31_CR12","doi-asserted-by":"crossref","unstructured":"Denkowski, M., Lavie, A.: Meteor universal: language specific translation evaluation for any target language. In: ACL, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"31_CR13","doi-asserted-by":"crossref","unstructured":"Ding, H., Jiang, X., Shuai, B., Liu, A.Q., Wang, G.: Context contrasted feature and gated multi-scale aggregation for scene segmentation. In: CVPR, pp. 2393\u20132402 (2018)","DOI":"10.1109\/CVPR.2018.00254"},{"key":"31_CR14","unstructured":"El Kholy, A., Habash, N., Leusch, G., Matusov, E., Sawaf, H.: Language independent connectivity strength features for phrase pivot statistical machine translation. In: ACL, pp. 412\u2013418 (2013)"},{"key":"31_CR15","doi-asserted-by":"crossref","unstructured":"Fang, H., et al.: From captions to visual concepts and back. In: CVPR, pp. 1473\u20131482 (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"31_CR16","doi-asserted-by":"crossref","unstructured":"Firat, O., Sankaran, B., Al-Onaizan, Y., Vural, F.T.Y., Cho, K.: Zero-resource translation with multi-lingual neural machine translation. In: EMNLP, pp. 268\u2013277 (2016)","DOI":"10.18653\/v1\/D16-1026"},{"key":"31_CR17","doi-asserted-by":"crossref","unstructured":"Gu, J., Cai, J., Joty, S., Niu, L., Wang, G.: Look, imagine and match: improving textual-visual cross-modal retrieval with generative models. In: CVPR, pp. 7181\u20137189 (2018)","DOI":"10.1109\/CVPR.2018.00750"},{"key":"31_CR18","doi-asserted-by":"crossref","unstructured":"Gu, J., Cai, J., Wang, G., Chen, T.: Stack-captioning: coarse-to-fine learning for image captioning. In: AAAI, pp. 6837\u20136844 (2018)","DOI":"10.1609\/aaai.v32i1.12266"},{"key":"31_CR19","doi-asserted-by":"crossref","unstructured":"Gu, J., Wang, G., Cai, J., Chen, T.: An empirical study of language CNN for image captioning. In: ICCV, pp. 1222\u20131231 (2017)","DOI":"10.1109\/ICCV.2017.138"},{"key":"31_CR20","doi-asserted-by":"publisher","first-page":"354","DOI":"10.1016\/j.patcog.2017.10.013","volume":"77","author":"J Gu","year":"2017","unstructured":"Gu, J.: Recent advances in convolutional neural networks. Pattern Recognit. 77, 354\u2013377 (2017)","journal-title":"Pattern Recognit."},{"key":"31_CR21","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"31_CR22","doi-asserted-by":"crossref","unstructured":"Hitschler, J., Schamoni, S., Riezler, S.: Multimodal pivots for image caption translation. In: ACL, pp. 2399\u20132409 (2016)","DOI":"10.18653\/v1\/P16-1227"},{"key":"31_CR23","doi-asserted-by":"crossref","unstructured":"Jean, S., Cho, K., Memisevic, R., Bengio, Y.: On using very large target vocabulary for neural machine translation. In: ACL, pp. 1\u201310 (2015)","DOI":"10.3115\/v1\/P15-1001"},{"key":"31_CR24","doi-asserted-by":"crossref","unstructured":"Jia, X., Gavves, E., Fernando, B., Tuytelaars, T.: Guiding long-short term memory for image caption generation. In: ICCV, pp. 2407\u20132415 (2015)","DOI":"10.1109\/ICCV.2015.277"},{"key":"31_CR25","doi-asserted-by":"crossref","unstructured":"Johnson, M., et al.: Google\u2019s multilingual neural machine translation system: enabling zero-shot translation. In: TACL, pp. 339\u2013352 (2016)","DOI":"10.1162\/tacl_a_00065"},{"key":"31_CR26","doi-asserted-by":"crossref","unstructured":"Kalchbrenner, N., Blunsom, P.: Recurrent continuous translation models. In: EMNLP, pp. 1700\u20131709 (2013)","DOI":"10.18653\/v1\/D13-1176"},{"key":"31_CR27","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: CVPR, pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"31_CR28","unstructured":"Kingma, D., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (2015)"},{"key":"31_CR29","doi-asserted-by":"crossref","unstructured":"Kulkarni, G., et al.: Baby talk: understanding and generating image descriptions. In: CVPR, pp. 1601\u20131608 (2011)","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"31_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"31_CR31","doi-asserted-by":"crossref","unstructured":"Liu, C., Sun, F., Wang, C., Wang, F., Yuille, A.: MAT: a multimodal attentive translator for image captioning. In: IJCAI, pp. 4033\u20134039 (2017)","DOI":"10.24963\/ijcai.2017\/563"},{"key":"31_CR32","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhu, Z., Ye, N., Guadarrama, S., Murphy, K.: Improved image captioning via policy gradient optimization of spider. In: ICCV, pp. 873\u2013881 (2017)","DOI":"10.1109\/ICCV.2017.100"},{"key":"31_CR33","unstructured":"Luong, M.T., Le, Q.V., Sutskever, I., Vinyals, O., Kaiser, L.: Multi-task sequence to sequence learning. In: ICLR (2016)"},{"key":"31_CR34","doi-asserted-by":"crossref","unstructured":"Luong, M.T., Sutskever, I., Le, Q.V., Vinyals, O., Zaremba, W.: Addressing the rare word problem in neural machine translation. In: ACL, pp. 11\u201319 (2015)","DOI":"10.3115\/v1\/P15-1002"},{"key":"31_CR35","doi-asserted-by":"crossref","unstructured":"Mi, H., Sankaran, B., Wang, Z., Ittycheriah, A.: Coverage embedding models for neural machine translation. In: EMNLP, pp. 955\u2013960 (2016)","DOI":"10.18653\/v1\/D16-1096"},{"key":"31_CR36","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: ACL, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"31_CR37","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: ICCV, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"31_CR38","unstructured":"Ranzato, M., Chopra, S., Auli, M., Zaremba, W.: Sequence level training with recurrent neural networks. In: ICLR (2016)"},{"key":"31_CR39","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V.: Self-critical sequence training for image captioning. In: CVPR, pp. 7008\u20137024 (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"31_CR40","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: NIPS, pp. 3104\u20133112 (2014)"},{"key":"31_CR41","doi-asserted-by":"crossref","unstructured":"Tu, Z., Lu, Z., Liu, Y., Liu, X., Li, H.: Modeling coverage for neural machine translation. In: ACL, pp. 76\u201385 (2016)","DOI":"10.18653\/v1\/P16-1008"},{"key":"31_CR42","unstructured":"Utiyama, M., Isahara, H.: A comparison of pivot methods for phrase-based statistical machine translation. In: NAACL, pp. 484\u2013491 (2007)"},{"key":"31_CR43","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: CIDEr: Consensus-based image description evaluation. In: CVPR, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"31_CR44","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: CVPR, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"issue":"4","key":"31_CR45","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","volume":"39","author":"Oriol Vinyals","year":"2017","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: lessons learned from the 2015 MSCOCO image captioning challenge. In: PAMI, pp. 652\u2013663 (2017)","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"31_CR46","doi-asserted-by":"publisher","first-page":"165","DOI":"10.1007\/s10590-008-9041-6","volume":"21","author":"H Wu","year":"2007","unstructured":"Wu, H., Wang, H.: Pivot language approach for phrase-based statistical machine translation. Mach. Transl. 21, 165\u2013181 (2007)","journal-title":"Mach. Transl."},{"key":"31_CR47","unstructured":"Wu, J., et al.: AI challenger: a large-scale dataset for going deeper in image understanding. arXiv preprint arXiv:1711.06475 (2017)"},{"key":"31_CR48","doi-asserted-by":"crossref","unstructured":"Wu, Q., Shen, C., Liu, L., Dick, A., Hengel, A.V.D.: What value do explicit high level concepts have in vision to language problems? In: CVPR, pp. 203\u2013212 (2016)","DOI":"10.1109\/CVPR.2016.29"},{"key":"31_CR49","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: ICML, pp. 2048\u20132057 (2015)"},{"key":"31_CR50","doi-asserted-by":"crossref","unstructured":"Yang, X., Zhang, H., Cai, J.: Shuffle-then-assemble: learning object-agnostic visual relationship features. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01258-8_3"},{"key":"31_CR51","doi-asserted-by":"crossref","unstructured":"Yao, T., Pan, Y., Li, Y., Qiu, Z., Mei, T.: Boosting image captioning with attributes. In: ICCV, pp. 22\u201329 (2017)","DOI":"10.1109\/ICCV.2017.524"},{"key":"31_CR52","doi-asserted-by":"crossref","unstructured":"You, Q., Jin, H., Wang, Z., Fang, C., Luo, J.: Image captioning with semantic attention. In: CVPR, pp. 4651\u20134659 (2016)","DOI":"10.1109\/CVPR.2016.503"},{"key":"31_CR53","unstructured":"Zahabi, S.T., Bakhshaei, S., Khadivi, S.: Using context vectors in improving a machine translation system with bridge language. In: ACL, pp. 318\u2013322 (2013)"},{"key":"31_CR54","doi-asserted-by":"crossref","unstructured":"Zhu, Y., et al.: Texygen: a benchmarking platform for text generation models. In: SIGIR, pp. 1097\u20131100 (2018)","DOI":"10.1145\/3209978.3210080"},{"key":"31_CR55","doi-asserted-by":"crossref","unstructured":"Zoph, B., Yuret, D., May, J., Knight, K.: Transfer learning for low-resource neural machine translation. In: EMNLP, pp. 1568\u20131575 (2016)","DOI":"10.18653\/v1\/D16-1163"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01246-5_31","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T18:37:22Z","timestamp":1775241442000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01246-5_31"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012458","9783030012465"],"references-count":55,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01246-5_31","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}