{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T12:45:54Z","timestamp":1763988354374,"version":"3.37.3"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2022,7,22]],"date-time":"2022-07-22T00:00:00Z","timestamp":1658448000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,7,22]],"date-time":"2022-07-22T00:00:00Z","timestamp":1658448000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2022,9]]},"DOI":"10.1007\/s11263-022-01636-2","type":"journal-article","created":{"date-parts":[[2022,7,22]],"date-time":"2022-07-22T17:09:41Z","timestamp":1658509781000},"page":"2305-2320","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Learning Cooperative Neural Modules for Stylized Image Captioning"],"prefix":"10.1007","volume":"130","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2056-6947","authenticated-orcid":false,"given":"Xinxiao","family":"Wu","sequence":"first","affiliation":[]},{"given":"Wentian","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,7,22]]},"reference":[{"key":"1636_CR1","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., & Gould, S. (2016). Spice: Semantic propositional image caption evaluation. In European conference on computer vision, (pp. 382\u2013398), Springer","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"1636_CR2","doi-asserted-by":"crossref","unstructured":"Andrew\u00a0Shin, Y.U., & Harada, T. (2016). Image captioning with sentiment terms via weakly-supervised sentiment dataset. In C. Richard, E.R.H. Wilson, W.A.P. Smith (eds). Proceedings of the british machine vision conference (BMVC), (pp 53.1\u201353.12), BMVA Press","DOI":"10.5244\/C.30.53"},{"key":"1636_CR3","unstructured":"Banerjee, S., & Lavie, A. (2005). Meteor: An automatic metric for mt evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, (pp. 65\u201372)."},{"key":"1636_CR4","doi-asserted-by":"publisher","first-page":"8151","DOI":"10.1609\/aaai.v33i01.33018151","volume":"33","author":"CK Chen","year":"2019","unstructured":"Chen, C. K., Pan, Z., Liu, M. Y., & Sun, M. (2019). Unsupervised stylish image description generation via domain layer norm. Proceedings of the AAAI Conference on Artificial Intelligence, 33, 8151\u20138158.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"1636_CR5","doi-asserted-by":"crossref","unstructured":"Chen, T., Zhang, Z., You, Q., Fang, C., Wang, Z., Jin, H., & Luo, J. (2018). \u201cfactual\u201dor\u201cemotional\u201d: Stylized image captioning with adaptive learning and attention. In Proceedings of the european conference on computer vision (ECCV), (pp. 519\u2013535).","DOI":"10.1007\/978-3-030-01249-6_32"},{"key":"1636_CR6","unstructured":"Dethlefs, N., & Cuay\u00e1huitl, H. (2010). Hierarchical reinforcement learning for adaptive text generation. In Proceedings of the 6th international natural language generation conference, association for computational linguistics, (pp. 37\u201345)."},{"key":"1636_CR7","doi-asserted-by":"crossref","unstructured":"Diao, H., Zhang, Y., Ma, L., & Lu, H. (2021). Similarity reasoning and filtration for image-text matching. Technical Report","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"1636_CR8","doi-asserted-by":"crossref","unstructured":"Fu, Z., Tan, X., Peng, N., Zhao, D., & Yan, R. (2018). Style transfer in text: exploration and evaluation. In Thirty-second AAAI conference on artificial intelligence, (pp. 663\u2013670).","DOI":"10.1609\/aaai.v32i1.11330"},{"key":"1636_CR9","doi-asserted-by":"crossref","unstructured":"Gan, C., Gan, Z., He, X., Gao, J., & Deng, L. (2017). Stylenet: Generating attractive visual captions with styles. In Proceedings of the IEEE conference on computer vision and pattern recognition, (pp. 3137\u20133146).","DOI":"10.1109\/CVPR.2017.108"},{"key":"1636_CR10","doi-asserted-by":"crossref","unstructured":"Gu, J., Cai, J., Wang, G., & Chen, T.(2018). Stack-captioning: Coarse-to-fine learning for image captioning. In Thirty-second AAAI conference on artificial intelligence, (pp. 6837\u20136844).","DOI":"10.1609\/aaai.v32i1.12266"},{"key":"1636_CR11","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Lu, S., & Lu, H. (2019). Show, tell and polish: Ruminant decoding for image captioning. IEEE Transactions on Multimedia, 22(8), 2149\u20132162.","DOI":"10.1109\/TMM.2019.2951226"},{"key":"1636_CR12","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Yao, P., Li, J., & Lu, H. (2019). Mscap: Multi-style image captioning with unpaired stylized text. In Proceedings of the IEEE conference on computer vision and pattern recognition, (pp. 4204\u20134213).","DOI":"10.1109\/CVPR.2019.00433"},{"key":"1636_CR13","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Zhu, X., He, X., Jiang, J., & Lu, H. (2020). Non-autoregressive image captioning with counterfactuals-critical multi-agent learning. In Proceedings of the twenty-ninth international joint conference on artificial intelligence, (pp. 767\u2013773).","DOI":"10.24963\/ijcai.2020\/107"},{"key":"1636_CR14","doi-asserted-by":"publisher","unstructured":"Honnibal, M., Montani, I., Van\u00a0Landeghem, S., & Boyd, A. (2020). spaCy: Industrial-strength natural language processing in python. https:\/\/doi.org\/10.5281\/zenodo.1212303","DOI":"10.5281\/zenodo.1212303"},{"key":"1636_CR15","doi-asserted-by":"publisher","first-page":"8465","DOI":"10.1609\/aaai.v33i01.33018465","volume":"33","author":"Q Huang","year":"2019","unstructured":"Huang, Q., Gan, Z., Celikyilmaz, A., Wu, D., Wang, J., & He, X. (2019). Hierarchically structured reinforcement learning for topically coherent visual story generation. Proceedings of the AAAI Conference on Artificial Intelligence, 33, 8465\u20138472.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"1636_CR16","doi-asserted-by":"crossref","unstructured":"Johnson, J., Krishna, R., Stark, M., Li, L.J., Shamma, D., Bernstein, M., & Fei-Fei, L. (2015). Image retrieval using scene graphs. In Proceedings of the IEEE conference on computer vision and pattern recognition, (pp. 3668\u20133678).","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"1636_CR17","doi-asserted-by":"publisher","unstructured":"Kim, Y. (2014). Convolutional neural networks for sentence classification. In Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), association for computational linguistics, (pp. 1746\u20131751), Doha, Qatar, https:\/\/doi.org\/10.3115\/v1\/D14-1181","DOI":"10.3115\/v1\/D14-1181"},{"key":"1636_CR18","unstructured":"Kingma, D.P., & Ba, J. (2015). Adam: A method for stochastic optimization. In 3rd international conference on learning representations"},{"key":"1636_CR19","unstructured":"Kiros, R., Zhu, Y., Salakhutdinov, R.R., Zemel, R., Urtasun, R., Torralba, A., & Fidler, S. (2015). Skip-thought vectors. In Advances in neural information processing systems, (pp. 3294\u20133302)."},{"key":"1636_CR20","doi-asserted-by":"crossref","unstructured":"Kong, X., Xin, B., Wang, Y., & Hua, G. (2017). Collaborative deep reinforcement learning for joint object search. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, (pp. 1695\u20131704).","DOI":"10.1109\/CVPR.2017.748"},{"key":"1636_CR21","doi-asserted-by":"crossref","unstructured":"Krause, J., Johnson, J., Krishna, R., & Fei-Fei, L. (2017). A hierarchical approach for generating descriptive image paragraphs. In Proceedings of the IEEE conference on computer vision and pattern recognition, (pp. 317\u2013325).","DOI":"10.1109\/CVPR.2017.356"},{"issue":"8","key":"1636_CR22","doi-asserted-by":"publisher","first-page":"2117","DOI":"10.1109\/TMM.2019.2896516","volume":"21","author":"X Li","year":"2019","unstructured":"Li, X., & Jiang, S. (2019). Know more say less: Image captioning based on scene graphs. IEEE Transactions on Multimedia, 21(8), 2117\u20132130.","journal-title":"IEEE Transactions on Multimedia"},{"key":"1636_CR23","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.L. (2014). Microsoft coco: Common objects in context. In European conference on computer vision, (pp. 740\u2013755), Springer.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1636_CR24","doi-asserted-by":"crossref","unstructured":"Liu, C., He, S., Liu, K., & Zhao, J. (2019). Vocabulary pyramid network: Multi-pass encoding and decoding with multi-level vocabularies for response generation. In Proceedings of the 57th annual meeting of the association for computational linguistics, (pp. 3774\u20133783).","DOI":"10.18653\/v1\/P19-1367"},{"key":"1636_CR25","doi-asserted-by":"crossref","unstructured":"Mathews, A., Xie, L., & He, X. (2018). Semstyle: Learning to generate stylised image captions using unaligned text. In Proceedings of the IEEE conference on computer vision and pattern recognition, (pp. 8591\u20138600).","DOI":"10.1109\/CVPR.2018.00896"},{"key":"1636_CR26","doi-asserted-by":"crossref","unstructured":"Mathews, A.P., Xie, L., & He, X. (2016). Senticap: Generating image descriptions with sentiments. In Thirtieth AAAI conference on artificial intelligence, (pp. 3574\u20133580).","DOI":"10.1609\/aaai.v30i1.10475"},{"issue":"3","key":"1636_CR27","doi-asserted-by":"publisher","first-page":"387","DOI":"10.1007\/s10458-005-2631-2","volume":"11","author":"L Panait","year":"2005","unstructured":"Panait, L., & Luke, S. (2005). Cooperative multi-agent learning: The state of the art. Autonomous Agents and Multi-agent Systems, 11(3), 387\u2013434.","journal-title":"Autonomous Agents and Multi-agent Systems"},{"key":"1636_CR28","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., & Zhu, W.J. (2002). Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting on association for computational linguistics, Association for Computational Linguistics, (pp. 311\u2013318).","DOI":"10.3115\/1073083.1073135"},{"key":"1636_CR29","doi-asserted-by":"crossref","unstructured":"Peng, B., Li, X., Li, L., Gao, J., Celikyilmaz, A., Lee, S., & Wong, K.F. (2017). Composite task-completion dialogue policy learning via hierarchical deep reinforcement learning. In Proceedings of the 2017 conference on empirical methods in natural language processing, (pp. 2231\u20132240).","DOI":"10.18653\/v1\/D17-1237"},{"key":"1636_CR30","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., & Goel, V. (2017). Self-critical sequence training for image captioning. In Proceedings of the IEEE conference on computer vision and pattern recognition, (pp. 7008\u20137024).","DOI":"10.1109\/CVPR.2017.131"},{"issue":"6","key":"1636_CR31","first-page":"1503","volume":"37","author":"LR Slevc","year":"2011","unstructured":"Slevc, L. R. (2011). Saying what\u2019s on your mind: Working memory effects on sentence production. Journal of Experimental Psychology: Learning, Memory, and Cognition, 37(6), 1503.","journal-title":"Journal of Experimental Psychology: Learning, Memory, and Cognition"},{"key":"1636_CR32","doi-asserted-by":"crossref","unstructured":"Stolcke, A. (2002). Srilm-an extensible language modeling toolkit. In Proceedings of ICSLP, (pp. 901\u2013904).","DOI":"10.21437\/ICSLP.2002-303"},{"key":"1636_CR33","doi-asserted-by":"crossref","unstructured":"Sun, X., Lu, W. (2020). Understanding attention for text classification. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics, (pp. 3418\u20133428).","DOI":"10.18653\/v1\/2020.acl-main.312"},{"key":"1636_CR34","volume-title":"Reinforcement learning: An introduction","author":"RS Sutton","year":"2018","unstructured":"Sutton, R. S., & Barto, A. G. (2018). Reinforcement learning: An introduction. Cambridge: MIT Press."},{"key":"1636_CR35","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D. (2015). Cider: Consensus-based image description evaluation. In Proceedings of the IEEE conference on computer vision and pattern recognition, (pp. 4566\u20134575).","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1636_CR36","doi-asserted-by":"crossref","unstructured":"Wang, X., Chen, W., Wu, J., Wang, Y.F., Yang\u00a0Wang, W. (2018). Video captioning via hierarchical reinforcement learning. In Proceedings of the IEEE conference on computer vision and pattern recognition, (pp. 4213\u20134222).","DOI":"10.1109\/CVPR.2018.00443"},{"issue":"3","key":"1636_CR37","first-page":"229","volume":"8","author":"RJ Williams","year":"1992","unstructured":"Williams, R. J. (1992). Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine Learning, 8(3), 229\u2013256.","journal-title":"Machine Learning"},{"issue":"3","key":"1636_CR38","doi-asserted-by":"publisher","first-page":"808","DOI":"10.1109\/TMM.2019.2931815","volume":"22","author":"L Wu","year":"2019","unstructured":"Wu, L., Xu, M., Wang, J., & Perry, S. (2019). Recall what you see continually using gridlstm in image captioning. IEEE Transactions on Multimedia, 22(3), 808\u2013818.","journal-title":"IEEE Transactions on Multimedia"},{"key":"1636_CR39","unstructured":"Xia, Y., Tian, F., Wu, L., Lin, J., Qin, T., Yu, N., & Liu, T.Y. (2017). Deliberation networks: Sequence generation beyond one-pass decoding. In Advances in neural information processing systems, (pp. 1784\u20131794)."},{"issue":"5","key":"1636_CR40","doi-asserted-by":"publisher","first-page":"1372","DOI":"10.1109\/TMM.2019.2941820","volume":"22","author":"N Xu","year":"2019","unstructured":"Xu, N., Zhang, H., Liu, A. A., Nie, W., Su, Y., Nie, J., & Zhang, Y. (2019). Multi-level policy and reward-based deep reinforcement learning framework for image captioning. IEEE Transactions on Multimedia, 22(5), 1372\u20131383.","journal-title":"IEEE Transactions on Multimedia"},{"key":"1636_CR41","doi-asserted-by":"crossref","unstructured":"Xu, W., Yu, J., Miao, Z., Wan, L., Tian, Y., Ji, Q. (2020). Deep reinforcement polishing network for video captioning. IEEE Transactions on Multimedia, 23, 1772\u20131784.","DOI":"10.1109\/TMM.2020.3002669"},{"key":"1636_CR42","doi-asserted-by":"crossref","unstructured":"Yang, X., Tang, K., Zhang, H., Cai, J. (2019). Auto-encoding scene graphs for image captioning. In Proceedings of the IEEE conference on computer vision and pattern recognition, (pp. 10685\u201310694).","DOI":"10.1109\/CVPR.2019.01094"},{"key":"1636_CR43","doi-asserted-by":"crossref","unstructured":"Zellers, R., Yatskar, M., Thomson, S., Choi, Y. (2018). Neural motifs: Scene graph parsing with global context. In Proceedings of the IEEE conference on computer vision and pattern recognition, (pp. 5831\u20135840).","DOI":"10.1109\/CVPR.2018.00611"},{"key":"1636_CR44","doi-asserted-by":"crossref","unstructured":"Zhao, W., Wu, X., & Zhang, X. (2020). Memcap: Memorizing style knowledge for image captioning. In The thirty-fourth AAAI conference on artificial intelligence, (pp. 12984\u201312992).","DOI":"10.1609\/aaai.v34i07.6998"},{"key":"1636_CR45","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Kiros, R., Zemel, R., Salakhutdinov, R., Urtasun, R., Torralba, A., & Fidler, S. (2015). Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. In Proceedings of the IEEE international conference on computer vision, (pp. 19\u201327).","DOI":"10.1109\/ICCV.2015.11"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-022-01636-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-022-01636-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-022-01636-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,2,12]],"date-time":"2023-02-12T11:22:47Z","timestamp":1676200967000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-022-01636-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,7,22]]},"references-count":45,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2022,9]]}},"alternative-id":["1636"],"URL":"https:\/\/doi.org\/10.1007\/s11263-022-01636-2","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2022,7,22]]},"assertion":[{"value":"12 July 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 May 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 July 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}