{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T23:15:55Z","timestamp":1751411755679,"version":"3.40.3"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031251979"},{"type":"electronic","value":"9783031251986"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-25198-6_26","type":"book-chapter","created":{"date-parts":[[2023,2,10]],"date-time":"2023-02-10T07:32:38Z","timestamp":1676014358000},"page":"341-355","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["TraVL: Transferring Pre-trained Visual-Linguistic Models for\u00a0Cross-Lingual Image Captioning"],"prefix":"10.1007","author":[{"given":"Zhebin","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Peng","family":"Lu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dawei","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gang","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,2,10]]},"reference":[{"key":"26_CR1","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"26_CR2","doi-asserted-by":"publisher","unstructured":"Aneja, J., Deshpande, A., Schwing, A.G.: Convolutional image captioning. In: CVPR. pp. 5561\u20135570 (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00583","DOI":"10.1109\/CVPR.2018.00583"},{"key":"26_CR3","unstructured":"Chen, J., Guo, H., Yi, K., Li, B., Elhoseiny, M.: Visualgpt: Data-efficient image captioning by balancing visual input and linguistic knowledge from pretraining. CoRR abs\/2102.10407 (2021)"},{"key":"26_CR4","doi-asserted-by":"publisher","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: CVPR, pp. 10575\u201310584 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.01059","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"26_CR5","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: Pre-training of deep bidirectional transformers for language understanding. In: NAACL, pp. 4171\u20134186 (2019). https:\/\/doi.org\/10.18653\/v1\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"26_CR6","unstructured":"Elliott, D., Frank, S., Hasler, E.: Multi-language image description with neural sequence models. CoRR abs\/1510.04709 (2015)"},{"key":"26_CR7","doi-asserted-by":"crossref","unstructured":"Farhadi, A., et al.: Every picture tells a story: Generating sentences from images. In: ECCV, pp. 15\u201329 (2010)","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"26_CR8","doi-asserted-by":"publisher","unstructured":"Freitag, M., Al-Onaizan, Y.: Beam search strategies for neural machine translation. In: NMT@ACL. pp. 56\u201360 (2017). https:\/\/doi.org\/10.18653\/v1\/w17-3207","DOI":"10.18653\/v1\/w17-3207"},{"key":"26_CR9","unstructured":"He, T., et al.: Layer-wise coordination between encoder and decoder for neural machine translation. In: NeurIPS, pp. 7955\u20137965 (2018)"},{"issue":"1","key":"26_CR10","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh, M., Young, P., Hockenmaier, J.: Framing image description as a ranking task: data, models and evaluation metrics. JAIR 47(1), 853\u2013899 (2013)","journal-title":"JAIR"},{"key":"26_CR11","doi-asserted-by":"publisher","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.: Attention on attention for image captioning. In: ICCV, pp. 4633\u20134642 (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00473","DOI":"10.1109\/ICCV.2019.00473"},{"issue":"3","key":"26_CR12","doi-asserted-by":"publisher","first-page":"317","DOI":"10.1007\/s41019-020-00129-x","volume":"5","author":"K Ichikawa","year":"2020","unstructured":"Ichikawa, K., Tamano, H.: Unsupervised qualitative scoring for binary item features. Data Sci. Eng. 5(3), 317\u2013330 (2020)","journal-title":"Data Sci. Eng."},{"key":"26_CR13","doi-asserted-by":"publisher","unstructured":"Jawahar, G., Sagot, B., Seddah, D.: What does BERT learn about the structure of language? In: ACL, pp. 3651\u20133657 (2019). https:\/\/doi.org\/10.18653\/v1\/p19-1356","DOI":"10.18653\/v1\/p19-1356"},{"key":"26_CR14","unstructured":"Karpathy, A., Joulin, A., Fei-Fei, L.: Deep fragment embeddings for bidirectional image sentence mapping. In: NeurIPS, pp. 1889\u20131897 (2014)"},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"Krishna, R., et al.: Visual genome: Connecting language and vision using crowdsourced dense image annotations. IJCV 123(1), 32\u201373 (2017)","DOI":"10.1007\/s11263-016-0981-7"},{"key":"26_CR16","doi-asserted-by":"publisher","unstructured":"Kulkarni, G., et al.: Baby talk: Understanding and generating simple image descriptions. In: CVPR, pp. 1601\u20131608 (2011). https:\/\/doi.org\/10.1109\/CVPR.2011.5995466","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"26_CR17","doi-asserted-by":"publisher","unstructured":"Lan, W., Li, X., Dong, J.: Fluency-guided cross-lingual image captioning. In: ACM Multimedia, pp. 1549\u20131557 (2017). https:\/\/doi.org\/10.1145\/3123266.3123366","DOI":"10.1145\/3123266.3123366"},{"key":"26_CR18","doi-asserted-by":"publisher","unstructured":"Li, W., et al.: UNIMO: towards unified-modal understanding and generation via cross-modal contrastive learning. In: ACL-IJCNLP, pp. 2592\u20132607 (2021). https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.202","DOI":"10.18653\/v1\/2021.acl-long.202"},{"issue":"9","key":"26_CR19","doi-asserted-by":"publisher","first-page":"2347","DOI":"10.1109\/TMM.2019.2896494","volume":"21","author":"X Li","year":"2019","unstructured":"Li, X., et al.: Coco-cn for cross-lingual image tagging, captioning, and retrieval. IEEE Multimedia 21(9), 2347\u20132360 (2019). https:\/\/doi.org\/10.1109\/TMM.2019.2896494","journal-title":"IEEE Multimedia"},{"key":"26_CR20","doi-asserted-by":"crossref","unstructured":"Li, X., et al.: Oscar: Object-semantics aligned pre-training for vision-language tasks. In: ECCV, pp. 121\u2013137 (2020)","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"26_CR21","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft coco: Common objects in context. In: ECCV, pp. 740\u2013755 (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"26_CR22","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2019)"},{"key":"26_CR23","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: NeurIPS, pp. 13\u201323 (2019)"},{"key":"26_CR24","doi-asserted-by":"publisher","unstructured":"Miyazaki, T., Shimizu, N.: Cross-lingual image caption generation. In: ACL, pp. 1780\u20131790 (2016). https:\/\/doi.org\/10.18653\/v1\/P16-1168","DOI":"10.18653\/v1\/P16-1168"},{"key":"26_CR25","volume-title":"Language models are unsupervised multitask learners","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners. Tech. rep, OpenAI (2019)"},{"key":"26_CR26","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: NeurIPS, pp. 91\u201399 (2015)"},{"key":"26_CR27","doi-asserted-by":"publisher","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: ACL, pp. 2556\u20132565 (2018). https:\/\/doi.org\/10.18653\/v1\/P18-1238","DOI":"10.18653\/v1\/P18-1238"},{"key":"26_CR28","unstructured":"Su, W., et al.: Vl-bert: Pre-training of generic visual-linguistic representations. In: ICLR (2020)"},{"key":"26_CR29","unstructured":"Tsutsui, S., Crandall, D.J.: Using artificial tokens to control languages for multilingual image caption generation. CoRR abs\/1706.06275 (2017)"},{"key":"26_CR30","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS, pp. 6000\u20136010 (2017)"},{"key":"26_CR31","doi-asserted-by":"publisher","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: A neural image caption generator. In: CVPR, pp. 3156\u20133164 (2015). https:\/\/doi.org\/10.1109\/CVPR.2015.7298935","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"26_CR32","doi-asserted-by":"publisher","first-page":"104543","DOI":"10.1109\/ACCESS.2020.2999568","volume":"8","author":"B Wang","year":"2020","unstructured":"Wang, B., Wang, C., Zhang, Q., Su, Y., Wang, Y., Xu, Y.: Cross-lingual image caption generation based on visual attention model. IEEE Access 8, 104543\u2013104554 (2020). https:\/\/doi.org\/10.1109\/ACCESS.2020.2999568","journal-title":"IEEE Access"},{"key":"26_CR33","doi-asserted-by":"crossref","unstructured":"Weng, R., Yu, H., Huang, S., Cheng, S., Luo, W.: Acquiring knowledge from pre-trained model to neural machine translation. In: AAAI, pp. 9266\u20139273 (2020)","DOI":"10.1609\/aaai.v34i05.6465"},{"key":"26_CR34","unstructured":"Xu, K., et al.: Show, attend and tell: Neural image caption generation with visual attention. In: ICML. vol. 37, pp. 2048\u20132057 (2015)"},{"key":"26_CR35","unstructured":"Xu, L., Zhang, X., Dong, Q.: Cluecorpus 2020: A large-scale chinese corpus for pre-training language model. CoRR abs\/2003.01355 (2020)"},{"key":"26_CR36","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J.J., Gao, J.: Unified vision-language pre-training for image captioning and VQA. In: AAAI, pp. 13041\u201313049 (2020)","DOI":"10.1609\/aaai.v34i07.7005"}],"container-title":["Lecture Notes in Computer Science","Web and Big Data"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-25198-6_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,2,10]],"date-time":"2023-02-10T08:01:57Z","timestamp":1676016117000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-25198-6_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031251979","9783031251986"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-25198-6_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"10 February 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"APWeb-WAIM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asia-Pacific Web (APWeb) and Web-Age Information Management (WAIM) Joint International Conference on Web and Big Data","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Nanjing","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 August 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 August 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"apwebwaim2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/apweb-waim2022.com\/proceedings","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"297","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"75","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"45","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"25% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5 Demo papers + 23 workshop papers","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}