{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T16:03:44Z","timestamp":1743091424886,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":24,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819984282"},{"type":"electronic","value":"9789819984299"}],"license":[{"start":{"date-parts":[[2023,12,24]],"date-time":"2023-12-24T00:00:00Z","timestamp":1703376000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,12,24]],"date-time":"2023-12-24T00:00:00Z","timestamp":1703376000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-99-8429-9_25","type":"book-chapter","created":{"date-parts":[[2023,12,23]],"date-time":"2023-12-23T08:02:17Z","timestamp":1703318537000},"page":"309-320","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Plugging Stylized Controls in\u00a0Open-Stylized Image Captioning"],"prefix":"10.1007","author":[{"given":"Jie","family":"Wang","sequence":"first","affiliation":[]},{"given":"Yixiao","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Ruoyi","family":"Du","sequence":"additional","affiliation":[]},{"given":"Yiming","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Kongming","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Zhanyu","family":"Ma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,12,24]]},"reference":[{"key":"25_CR1","unstructured":"Chen, Q., Deng, C., Wu, Q.: Learning distinct and representative modes for image captioning. arXiv preprint arXiv:2209.08231 (2022)"},{"key":"25_CR2","doi-asserted-by":"crossref","unstructured":"Cho, J., Yoon, S., Kale, A., Dernoncourt, F., Bui, T., Bansal, M.: Fine-grained image captioning with clip reward. arXiv preprint arXiv:2205.13115 (2022)","DOI":"10.18653\/v1\/2022.findings-naacl.39"},{"key":"25_CR3","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"25_CR4","doi-asserted-by":"crossref","unstructured":"Fabbri, A.R., Li, I., She, T., Li, S., Radev, D.R.: Multi-news: a large-scale multi-document summarization dataset and abstractive hierarchical model. arXiv preprint arXiv:1906.01749 (2019)","DOI":"10.18653\/v1\/P19-1102"},{"key":"25_CR5","doi-asserted-by":"crossref","unstructured":"Gan, C., Gan, Z., He, X., Gao, J., Deng, L.: Stylenet: generating attractive visual captions with styles. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3137\u20133146 (2017)","DOI":"10.1109\/CVPR.2017.108"},{"key":"25_CR6","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Yao, P., Li, J., Lu, H.: Mscap: multi-style image captioning with unpaired stylized text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
4204\u20134213 (2019)","DOI":"10.1109\/CVPR.2019.00433"},{"issue":"8","key":"25_CR7","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"25_CR8","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"issue":"7","key":"25_CR9","doi-asserted-by":"publisher","first-page":"1747","DOI":"10.1109\/TPAMI.2018.2836461","volume":"41","author":"K Liang","year":"2018","unstructured":"Liang, K., Chang, H., Ma, B., Shan, S., Chen, X.: Unifying visual attribute learning with object recognition in a multiplicative framework. IEEE Trans. Pattern Anal. Mach. Intell. 41(7), 1747\u20131760 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"25_CR10","doi-asserted-by":"crossref","unstructured":"Liang, K., Chang, H., Shan, S., Chen, X.: A unified multiplicative framework for attribute learning. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2506\u20132514 (2015)","DOI":"10.1109\/ICCV.2015.288"},{"key":"25_CR11","doi-asserted-by":"crossref","unstructured":"Liang, K., Guo, Y., Chang, H., Chen, X.: Visual relationship detection with deep structural ranking. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32 (2018)","DOI":"10.1609\/aaai.v32i1.12274"},{"key":"25_CR12","doi-asserted-by":"crossref","unstructured":"Mathews, A., Xie, L., He, X.: Senticap: generating image descriptions with sentiments. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 30 (2016)","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"25_CR13","unstructured":"Mokady, R., Hertz, A., Bermano, A.H.: Clipcap: clip prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)"},{"key":"25_CR14","doi-asserted-by":"crossref","unstructured":"Nukrai, D., Mokady, R., Globerson, A.: Text-only training for image captioning using noise-injected clip. arXiv preprint arXiv:2211.00575 (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.299"},{"key":"25_CR15","doi-asserted-by":"crossref","unstructured":"Pedersoli, M., Lucas, T., Schmid, C., Verbeek, J.: Areas of attention for image captioning. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1242\u20131250 (2017)","DOI":"10.1109\/ICCV.2017.140"},{"key":"25_CR16","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"issue":"8","key":"25_CR17","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 9 (2019)","journal-title":"OpenAI Blog"},{"key":"25_CR18","unstructured":"Su, Y., et al.: Language models can see: plugging visual controls in text generation. arXiv preprint arXiv:2205.02655 (2022)"},{"key":"25_CR19","doi-asserted-by":"crossref","unstructured":"Tewel, Y., Shalev, Y., Schwartz, I., Wolf, L.: Zerocap: zero-shot image-to-text generation for visual-semantic arithmetic. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
17918\u201317928 (2022)","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"25_CR20","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"issue":"12","key":"25_CR21","doi-asserted-by":"publisher","first-page":"4467","DOI":"10.1109\/TCSVT.2019.2947482","volume":"30","author":"J Yu","year":"2019","unstructured":"Yu, J., Li, J., Yu, Z., Huang, Q.: Multimodal transformer with multi-view visual representation for image captioning. IEEE Trans. Circuits Syst. Video Technol. 30(12), 4467\u20134480 (2019)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"25_CR22","doi-asserted-by":"crossref","unstructured":"Zeng, Z., Zhang, H., Lu, R., Wang, D., Chen, B., Wang, Z.: Conzic: controllable zero-shot image captioning by sampling-based polishing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23465\u201323476 (2023)","DOI":"10.1109\/CVPR52729.2023.02247"},{"key":"25_CR23","doi-asserted-by":"crossref","unstructured":"Zhao, W., Wu, X., Zhang, X.: Memcap: memorizing style knowledge for image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 12984\u201312992 (2020)","DOI":"10.1609\/aaai.v34i07.6998"},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Long, G.: Style-aware contrastive learning for multi-style image captioning. arXiv preprint arXiv:2301.11367 (2023)","DOI":"10.18653\/v1\/2023.findings-eacl.169"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-99-8429-9_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,23]],"date-time":"2023-12-23T08:19:48Z","timestamp":1703319588000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-99-8429-9_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,24]]},"ISBN":["9789819984282","9789819984299"],"references-count":24,"URL":"https:\/\/doi.org\/10.1007\/978-981-99-8429-9_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023,12,24]]},"assertion":[{"value":"24 December 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Xiamen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 October 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"15 October 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/prcv2023.xmu.edu.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Microsoft CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1420","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"532","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"37% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3,78","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3,69","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}