{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T17:10:45Z","timestamp":1772644245749,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,13]],"date-time":"2023-10-13T00:00:00Z","timestamp":1697155200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,13]]},"DOI":"10.1145\/3640771.3640783","type":"proceedings-article","created":{"date-parts":[[2024,3,29]],"date-time":"2024-03-29T18:08:54Z","timestamp":1711735734000},"page":"50-55","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["CLIP-M-Cap: CLIP Mean Teacher for Image Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-4596-6709","authenticated-orcid":false,"given":"Shuo","family":"Chen","sequence":"first","affiliation":[{"name":"Shanghai university of engineering science, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1558-1656","authenticated-orcid":false,"given":"Juan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai university of engineering science, China"}]}],"member":"320","published-online":{"date-parts":[[2024,3,29]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295748"},{"key":"e_1_3_2_1_2_1","volume-title":"Show and tell: Lessons learned from the 2015 mscoco image captioning challenge[J]","author":"Vinyals O","year":"2016","unstructured":"Vinyals O, Toshev A, Bengio S, Show and tell: Lessons learned from the 2015 mscoco image captioning challenge[J]. IEEE transactions on pattern analysis and machine intelligence, 2016, 39(4): 652-663."},{"key":"e_1_3_2_1_3_1","volume-title":"Image captioning: a comprehensive survey[C]\/\/2020 International Conference on Power Electronics & IoT Applications in Renewable Energy and its Control (PARC)","author":"Sharma H","year":"2020","unstructured":"Sharma H, Agrahari M, Singh S K, Image captioning: a comprehensive survey[C]\/\/2020 International Conference on Power Electronics & IoT Applications in Renewable Energy and its Control (PARC). IEEE, 2020: 325-328."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1155\/2021\/5538927"},{"key":"e_1_3_2_1_5_1","volume-title":"Learning visual representation from modality-shared contrastive language-image pre-training[C]\/\/European Conference on Computer Vision","author":"You H","year":"2022","unstructured":"You H, Zhou L, Xiao B, Learning visual representation from modality-shared contrastive language-image pre-training[C]\/\/European Conference on Computer Vision. Cham: Springer Nature Switzerland, 2022: 69-87."},{"key":"e_1_3_2_1_6_1","volume-title":"Mean teachers are better role models: Weight-averaged consistency targets improve semi-supervised deep learning results[J]. Advances in neural information processing systems","author":"Tarvainen A","year":"2017","unstructured":"Tarvainen A, Valpola H. Mean teachers are better role models: Weight-averaged consistency targets improve semi-supervised deep learning results[J]. 
Advances in neural information processing systems, 2017, 30."},{"key":"e_1_3_2_1_7_1","volume-title":"CaMEL: mean teacher learning for image captioning[C]\/\/2022 26th International Conference on Pattern Recognition (ICPR)","author":"Barraco M","year":"2022","unstructured":"Barraco M, Stefanini M, Cornia M, CaMEL: mean teacher learning for image captioning[C]\/\/2022 26th International Conference on Pattern Recognition (ICPR). IEEE, 2022: 4087-4094."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Barraco M Cornia M Cascianelli S The unreasonable effectiveness of CLIP features for image captioning: an experimental analysis[C]\/\/proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022: 4662-4670.","DOI":"10.1109\/CVPRW56347.2022.00512"},{"key":"e_1_3_2_1_9_1","volume-title":"A thorough review on recent deep learning methodologies for image captioning[J]. arXiv preprint arXiv:2107.13114","author":"Elhagry A","year":"2021","unstructured":"Elhagry A, Kadaoui K. A thorough review on recent deep learning methodologies for image captioning[J]. arXiv preprint arXiv:2107.13114, 2021."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4842-2845-6"},{"key":"e_1_3_2_1_11_1","volume-title":"A review of recurrent neural networks: LSTM cells and network architectures[J]. Neural computation","author":"Yu Y","year":"2019","unstructured":"Yu Y, Si X, Hu C, A review of recurrent neural networks: LSTM cells and network architectures[J]. Neural computation, 2019, 31(7): 1235-1270."},{"key":"e_1_3_2_1_12_1","volume-title":"PMLR","author":"Xu K","unstructured":"Xu K, Ba J, Kiros R, Show, attend and tell: Neural image caption generation with visual attention[C]\/\/International conference on machine learning. PMLR, 2015: 2048-2057."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Anderson P He X Buehler C Bottom-up and top-down attention for image captioning and visual question answering[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 2018: 6077-6086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_14_1","volume-title":"Attention is all you need[J]. Advances in neural information processing systems","author":"Vaswani A","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, Attention is all you need[J]. Advances in neural information processing systems, 2017, 30."},{"key":"e_1_3_2_1_15_1","volume-title":"PMLR","author":"Parmar N","unstructured":"Parmar N, Vaswani A, Uszkoreit J, Image transformer[C]\/\/International conference on machine learning. PMLR, 2018: 4055-4064."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1369-5"},{"key":"e_1_3_2_1_17_1","volume-title":"a method for automatic evaluation of machine translation[C]\/\/Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 2002: 311-318","author":"Papineni K","unstructured":"Papineni K, Roukos S, Ward T, Bleu: a method for automatic evaluation of machine translation[C]\/\/Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 2002: 311-318."},{"key":"e_1_3_2_1_18_1","unstructured":"Banerjee S Lavie A. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments[C]\/\/Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 
2005: 65-72."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Vedantam R Lawrence Zitnick C Parikh D. Cider: Consensus-based image description evaluation[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 2015: 4566-4575.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_20_1","unstructured":"Lin C Y. Rouge: A package for automatic evaluation of summaries[C]\/\/Text summarization branches out. 2004: 74-81."},{"key":"e_1_3_2_1_21_1","volume-title":"The illustrated gpt-2 (visualizing transformer language models)[J]. Jalammar. github. io. https:\/\/jalammar. github. io\/illustrated-gpt2","author":"Alammar J.","year":"2019","unstructured":"Alammar J. The illustrated gpt-2 (visualizing transformer language models)[J]. Jalammar. github. io. https:\/\/jalammar. github. io\/illustrated-gpt2, 2019."},{"key":"e_1_3_2_1_22_1","volume-title":"Multi layer perceptron[J]. Machine Learning Lab Special Lecture","author":"Riedmiller M","year":"2014","unstructured":"Riedmiller M, Lernen A. Multi layer perceptron[J]. Machine Learning Lab Special Lecture, University of Freiburg, 2014: 7-24."},{"key":"e_1_3_2_1_23_1","volume-title":"Microsoft coco: Common objects in context[C]\/\/Computer Vision\u2013ECCV 2014: 13th European Conference","author":"Lin T Y","year":"2014","unstructured":"Lin T Y, Maire M, Belongie S, Microsoft coco: Common objects in context[C]\/\/Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13. Springer International Publishing, 2014: 740-755."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Plummer B A Wang L Cervantes C M Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models[C]\/\/Proceedings of the IEEE international conference on computer vision. 2015: 2641-2649.","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_25_1","first-page":"1","volume-title":"ACM Transactions on Information Systems (TOIS)","author":"Wu H C","year":"2008","unstructured":"Wu H C, Luk R W P, Wong K F, Interpreting TF-IDF term weights as making relevance decisions[J]. ACM Transactions on Information Systems (TOIS), 2008, 26(3): 1-37."},{"key":"e_1_3_2_1_26_1","volume-title":"Johnson M","author":"Anderson P","year":"2016","unstructured":"Anderson P, Fernando B, Johnson M, Spice: Semantic propositional image caption evaluation[C]\/\/Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14. Springer International Publishing, 2016: 382-398."},{"key":"e_1_3_2_1_27_1","first-page":"87","article-title":"Programming with TensorFlow","volume":"2021","author":"Imambi S","unstructured":"Imambi S, Prakash K B, Kanagachidambaresan G R. PyTorch[J]. Programming with TensorFlow: Solution for Edge Computing Applications, 2021: 87-104.","journal-title":"Solution for Edge Computing Applications"},{"key":"e_1_3_2_1_28_1","volume-title":"Unified vision-language pre-training for image captioning and vqa[C]\/\/Proceedings of the AAAI conference on artificial intelligence","author":"Zhou L","year":"2020","unstructured":"Zhou L, Palangi H, Zhang L, Unified vision-language pre-training for image captioning and vqa[C]\/\/Proceedings of the AAAI conference on artificial intelligence. 2020, 34(07): 13041-13049."},{"key":"e_1_3_2_1_29_1","volume-title":"Bermano A H. Clipcap: Clip prefix for image captioning[J]. 
arXiv preprint arXiv:2111.09734","author":"Mokady R","year":"2021","unstructured":"Mokady R, Hertz A, Bermano A H. Clipcap: Clip prefix for image captioning[J]. arXiv preprint arXiv:2111.09734, 2021."},{"key":"e_1_3_2_1_30_1","first-page":"121","volume-title":"Proceedings, Part XXX 16","author":"Li X","year":"2020","unstructured":"Li X, Yin X, Li C, Oscar: Object-semantics aligned pre-training for vision-language tasks[C]\/\/Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX 16. Springer International Publishing, 2020: 121-137."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Hu X Gan Z Wang J Scaling up vision-language pre-training for image captioning[C]\/\/Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022: 17980-17989.","DOI":"10.1109\/CVPR52688.2022.01745"},{"key":"e_1_3_2_1_32_1","volume-title":"Language models can see: Plugging visual controls in text generation[J]. arXiv preprint arXiv:2205.02655","author":"Su Y","year":"2022","unstructured":"Su Y, Lan T, Liu Y, Language models can see: Plugging visual controls in text generation[J]. arXiv preprint arXiv:2205.02655, 2022."},{"key":"e_1_3_2_1_33_1","volume-title":"Zero-shot image-to-text generation for visual-semantic arithmetic[J]. arXiv preprint arXiv:2111.14447","author":"Tewel Y","year":"2021","unstructured":"Tewel Y, Shalev Y, Schwartz I, Zero-shot image-to-text generation for visual-semantic arithmetic[J]. arXiv preprint arXiv:2111.14447, 2021, 2."},{"key":"e_1_3_2_1_34_1","volume-title":"Text-only training for image captioning using noise-injected clip[J]. arXiv preprint arXiv:2211.00575","author":"Nukrai D","year":"2022","unstructured":"Nukrai D, Mokady R, Globerson A. Text-only training for image captioning using noise-injected clip[J]. arXiv preprint arXiv:2211.00575, 2022."},{"key":"e_1_3_2_1_35_1","volume-title":"PMLR","author":"Radford A","unstructured":"Radford A, Kim J W, Hallacy C, Learning transferable visual models from natural language supervision[C]\/\/International conference on machine learning. PMLR, 2021: 8748-8763."}],"event":{"name":"ISCAI 2023: 2023 2nd International Symposium on Computing and Artificial Intelligence","location":"Shanghai China","acronym":"ISCAI 2023"},"container-title":["Proceedings of the 2023 2nd International Symposium on Computing and Artificial Intelligence"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3640771.3640783","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3640771.3640783","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T02:30:45Z","timestamp":1755916245000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3640771.3640783"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,13]]},"references-count":35,"alternative-id":["10.1145\/3640771.3640783","10.1145\/3640771"],"URL":"https:\/\/doi.org\/10.1145\/3640771.3640783","relation":{},"subject":[],"published":{"date-parts":[[2023,10,13]]},"assertion":[{"value":"2024-03-29","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
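The record above is the standard envelope returned by the Crossref REST API: a "status" / "message-type" / "message" wrapper around the work metadata. As a minimal sketch of how such a record can be fetched and read, assuming only Python's standard library and network access to the public api.crossref.org endpoint (the script and its variable names are illustrative, not part of the record):

import json
from urllib.request import urlopen

DOI = "10.1145/3640771.3640783"

# Fetch the work record; the JSON body is the envelope shown above.
with urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    envelope = json.load(resp)

work = envelope["message"]  # the work metadata object

print(work["title"][0])                                    # paper title
print(work["DOI"], work["type"])                           # DOI and "proceedings-article"
print("deposited references:", len(work.get("reference", [])))  # 35 here
print("cited by:", work.get("is-referenced-by-count", 0))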