{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T02:55:15Z","timestamp":1743044115239,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":34,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819723898"},{"type":"electronic","value":"9789819723904"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-97-2390-4_9","type":"book-chapter","created":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T18:02:02Z","timestamp":1714240922000},"page":"117-131","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["PAEE: Parameter-Efficient and\u00a0Data-Effective Image Captioning Model with\u00a0Knowledge Prompter and\u00a0Cross-Modal Representation Aligner"],"prefix":"10.1007","author":[{"given":"Yunji","family":"Tian","sequence":"first","affiliation":[]},{"given":"Zhiming","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Quan","family":"Zou","sequence":"additional","affiliation":[]},{"given":"Geng","family":"Chen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,4,28]]},"reference":[{"key":"9_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, H., et al.: nocaps: novel object captioning at scale. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8948\u20138957 (2019)","DOI":"10.1109\/ICCV.2019.00904"},{"key":"9_CR2","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1007\/978-3-319-46454-1_24","volume-title":"Computer Vision \u2013 ECCV 2016","author":"P Anderson","year":"2016","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 382\u2013398. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_24"},{"key":"9_CR4","unstructured":"Byeon, M., Park, B., Kim, H., Lee, S., Baek, W., Kim, S.: COYO-700M: image-text pair dataset (2022)"},{"key":"9_CR5","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12M: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3558\u20133568 (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"9_CR6","unstructured":"Chen, X., et al.: Microsoft COCO captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"9_CR7","unstructured":"Cho, J., Lei, J., Tan, H., Bansal, M.: Unifying vision-and-language tasks via text generation. In: International Conference on Machine Learning, pp. 1931\u20131942. PMLR (2021)"},{"key":"9_CR8","doi-asserted-by":"crossref","unstructured":"Dai, W., Hou, L., Shang, L., Jiang, X., Liu, Q., Fung, P.: Enabling multimodal generation on CLIP via vision-language knowledge distillation. arXiv preprint arXiv:2203.06386 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.187"},{"key":"9_CR9","doi-asserted-by":"crossref","unstructured":"Denkowski, M., Lavie, A.: Meteor Universal: language specific translation evaluation for any target language. In: Proceedings of the Ninth Workshop on Statistical Machine Translation, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"9_CR10","doi-asserted-by":"crossref","unstructured":"Eichenberg, C., Black, S., Weinbach, S., Parcalabescu, L., Frank, A.: MAGMA\u2013multimodal augmentation of generative models through adapter-based finetuning. arXiv preprint arXiv:2112.05253 (2021)","DOI":"10.18653\/v1\/2022.findings-emnlp.179"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Gao, T., Fisch, A., Chen, D.: Making pre-trained language models better few-shot learners. arXiv preprint arXiv:2012.15723 (2020)","DOI":"10.18653\/v1\/2021.acl-long.295"},{"key":"9_CR12","doi-asserted-by":"crossref","unstructured":"Gurari, D., et al.: VizWiz Grand Challenge: answering visual questions from blind people. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3608\u20133617 (2018)","DOI":"10.1109\/CVPR.2018.00380"},{"key":"9_CR13","doi-asserted-by":"publisher","unstructured":"Gurari, D., Zhao, Y., Zhang, M., Bhattacharya, N.: Captioning images taken by people who are blind. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XVII 16, pp. 417\u2013434. Springer (2020). https:\/\/doi.org\/10.1007\/978-3-030-58520-4_25","DOI":"10.1007\/978-3-030-58520-4_25"},{"key":"9_CR14","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual Genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"9_CR15","unstructured":"Lewis, P., et al.: Retrieval-augmented generation for knowledge-intensive NLP tasks. Adv. Neural. Inf. Process. Syst. 33, 9459\u20139474 (2020)"},{"key":"9_CR16","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"9_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: OSCAR: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"9_CR18","doi-asserted-by":"crossref","unstructured":"Luo, Z., Hu, Z., Xi, Y., Zhang, R., Ma, J.: I-Tuning: tuning frozen language models with image for lightweight image captioning (2023)","DOI":"10.1109\/ICASSP49357.2023.10096424"},{"key":"9_CR19","unstructured":"Mokady, R., Hertz, A., Bermano, A.H.: ClipCap: CLIP prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)"},{"key":"9_CR20","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.: Im2Text: describing images using 1 million captioned photographs. Adv. Neural Inf. Proc. Syst. 24 (2011)"},{"key":"9_CR21","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"9_CR22","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k Entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"9_CR23","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"issue":"8","key":"9_CR24","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"9_CR25","doi-asserted-by":"crossref","unstructured":"Ramos, R., Martins, B., Elliott, D., Kementchedjhieva, Y.: SmallCap: lightweight image captioning prompted with retrieval augmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2840\u20132849 (2023)","DOI":"10.1109\/CVPR52729.2023.00278"},{"key":"9_CR26","doi-asserted-by":"crossref","unstructured":"Sarto, S., Cornia, M., Baraldi, L., Cucchiara, R.: Retrieval-augmented transformer for image captioning. In: Proceedings of the 19th International Conference on Content-based Multimedia Indexing, pp. 1\u20137 (2022)","DOI":"10.1145\/3549555.3549585"},{"key":"9_CR27","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"9_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"742","DOI":"10.1007\/978-3-030-58536-5_44","volume-title":"Computer Vision \u2013 ECCV 2020","author":"O Sidorov","year":"2020","unstructured":"Sidorov, O., Hu, R., Rohrbach, M., Singh, A.: TextCaps: a dataset for image captioning with reading comprehension. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12347, pp. 742\u2013758. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_44"},{"key":"9_CR29","first-page":"200","volume":"34","author":"M Tsimpoukelli","year":"2021","unstructured":"Tsimpoukelli, M., Menick, J.L., Cabi, S., Eslami, S., Vinyals, O., Hill, F.: Multimodal few-shot learning with frozen language models. Adv. Neural. Inf. Process. Syst. 34, 200\u2013212 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR30","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"9_CR31","doi-asserted-by":"publisher","unstructured":"Xia, Q., et al.: XGPT: cross-modal generative pre-training for image captioning. In: Natural Language Processing and Chinese Computing: 10th CCF International Conference, NLPCC 2021, Qingdao, China, October 13\u201317, 2021, Proceedings, Part I 10, pp. 786\u2013797. Springer (2021). https:\/\/doi.org\/10.1007\/978-3-030-88480-2_63","DOI":"10.1007\/978-3-030-88480-2_63"},{"key":"9_CR32","doi-asserted-by":"crossref","unstructured":"Xu, C., Zhao, W., Yang, M., Ao, X., Cheng, W., Tian, J.: A unified generation-retrieval framework for image captioning. In: Proceedings of the 28th ACM International Conference on Information and Knowledge Management, pp. 2313\u20132316 (2019)","DOI":"10.1145\/3357384.3358105"},{"key":"9_CR33","doi-asserted-by":"publisher","unstructured":"Yang, Z., et al.: UniTAB: unifying text and box outputs for grounded vision-language modeling. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXXVI, pp. 521\u2013539. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_30","DOI":"10.1007\/978-3-031-20059-5_30"},{"key":"9_CR34","unstructured":"Zhang, S., et al.: OPT: open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"}],"container-title":["Lecture Notes in Computer Science","Web and Big Data"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-2390-4_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T18:14:09Z","timestamp":1714241649000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-2390-4_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9789819723898","9789819723904"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-2390-4_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"28 April 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"APWeb-WAIM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asia-Pacific Web (APWeb) and Web-Age Information Management (WAIM) Joint International Conference on Web and Big Data","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 October 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 October 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"apwebwaim2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.apweb-waim2023.com\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}