{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:02:28Z","timestamp":1750309348387,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,7,12]],"date-time":"2024-07-12T00:00:00Z","timestamp":1720742400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,7,12]]},"DOI":"10.1145\/3686490.3686513","type":"proceedings-article","created":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T16:29:37Z","timestamp":1728664177000},"page":"151-158","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Hybrid Feature Based Global Variational Transformer for Diverse Image Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-7112-8765","authenticated-orcid":false,"given":"Xiaodong","family":"Wang","sequence":"first","affiliation":[{"name":"Xuzhou Guanglian Technology Co,Ltd, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4231-9975","authenticated-orcid":false,"given":"Bo","family":"Xiang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, China University of Mining and Technology, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3138-5175","authenticated-orcid":false,"given":"Weifeng","family":"Shi","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, China University of Mining and Technology, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,11]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2867286"},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the Proc. Adv. Neural Inf. Process. Syst.","volume":"25","author":"Krizhevsky A.","year":"2012","unstructured":"Krizhevsky, A.; Sutskever, I.; Hinton, G. ImageNet classification with deep convolutional neural networks. In Proceedings of the Proc. Adv. Neural Inf. Process. Syst., 2012, Vol. 25."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_4_1","first-page":"91","article-title":"Towards real-time object detection with region proposal networks","volume":"28","author":"Ren S.","year":"2015","unstructured":"Ren, S.; He, K.; Girshick, R.; Sun, J. Faster r-cnn: Towards real-time object detection with region proposal networks. In Proceedings of the Proc. Adv. Neural Inf. Process. Syst., 2015, Vol. 28, pp. 91\u201399.","journal-title":"Proceedings of the Proc. Adv. Neural Inf. Process. Syst."},{"key":"e_1_3_2_1_5_1","volume-title":"SSD: Single Shot MultiBox Detector. In Proceedings of the Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Liu W.","year":"2016","unstructured":"Liu, W.; Anguelov, D.; Erhan, D.; Szegedy, C.; Reed, S.; Fu, C.Y.; Berg, A.C. SSD: Single Shot MultiBox Detector. In Proceedings of the Proc. IEEE Conf. Comput. Vis. Pattern Recognit., 2016."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"e_1_3_2_1_7_1","first-page":"1709","volume-title":"Proceedings of the Proc. Conf. Empir. Methods Nat.Lang. Process.","author":"Kalchbrenner N.","year":"2013","unstructured":"Kalchbrenner, N.; Blunsom, P. Recurrent continuous translation models. In Proceedings of the Proc. Conf. Empir. Methods Nat.Lang. Process., 2013, pp. 1700\u20131709."},{"key":"e_1_3_2_1_8_1","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","author":"Sutskever I.","year":"2014","unstructured":"Sutskever, I.; Vinyals, O.; Le, Q.V. Sequence to sequence learning with neural networks. In Proceedings of the Proc. Adv. Neural Inf. Process. Syst., 2014, pp. 3104\u20133112.","journal-title":"Proceedings of the Proc. Adv. Neural Inf. Process. Syst."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_10_1","first-page":"5998","article-title":"Attention is all you need","author":"Vaswani A.","year":"2017","unstructured":"Vaswani, A.; Shazeer, N.; Parmar, N.; Uszkoreit, J.; Jones, L.; Gomez, A.N.; Kaiser, \u0141.; Polosukhin, I. Attention is all you need. In Proceedings of the Proc. Adv. Neural Inf. Process. Syst., 2017, pp. 5998\u20136008.","journal-title":"Proceedings of the Proc. Adv. Neural Inf. Process. Syst."},{"key":"e_1_3_2_1_11_1","first-page":"755","volume-title":"Proceedings of the Proc. Eur. Conf. Comput. Vis.","author":"Lin T.Y.","year":"2014","unstructured":"Lin, T.Y.; Maire, M.; Belongie, S.; Hays, J.; Perona, P.; Ramanan, D.; Doll\u00e1r, P.; Zitnick, C.L. Microsoft coco: Common objects in context. In Proceedings of the Proc. Eur. Conf. Comput. Vis., 2014, pp. 740\u2013755."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.323"},{"key":"e_1_3_2_1_13_1","volume-title":"Generating Diverse and Accurate Visual Captions by Comparative Adversarial Learning. CoRR","author":"Li D.","year":"2018","unstructured":"Li, D.; Huang, Q.; He, X.; Zhang, L.; Sun, M. Generating Diverse and Accurate Visual Captions by Comparative Adversarial Learning. CoRR 2018, abs\/1804.00861."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.445"},{"key":"e_1_3_2_1_15_1","first-page":"10704","volume-title":"Part-Of-Speech. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019","author":"Deshpande A.","year":"2019","unstructured":"Deshpande, A.; Aneja, J.; Wang, L.; Schwing, A.G.; Forsyth, D.A. Fast, Diverse and Accurate Image Captioning Guided by Part-Of-Speech. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019, Long Beach,CA, USA, June 16-20, 2019, 2019, pp. 10695\u201310704."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12340"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00436"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 8th International Conference on Learning Representations, ICLR 2020","author":"Mahajan S.","year":"2020","unstructured":"Mahajan, S.; Gurevych, I.; Roth, S. Latent Normalizing Flows for Many-to-Many Cross-Domain Mappings. In Proceedings of the 8th International Conference on Learning Representations, ICLR 2020, Addis Ababa, Ethiopia, April 26-30, 2020, 2020."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Mahajan S.","year":"2020","unstructured":"Mahajan, S.; Roth, S. Diverse Image Captioning with Context-Object Split Latent Spaces. In Proceedings of the Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual, 2020."},{"key":"e_1_3_2_1_20_1","volume-title":"Long short-term memory. Neural computation","author":"Hochreiter S.","year":"1997","unstructured":"Hochreiter, S.; Schmidhuber, J. Long short-term memory. Neural computation 1997."},{"key":"e_1_3_2_1_21_1","volume-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan K.","year":"2015","unstructured":"Simonyan, K.; Zisserman, A. Very deep convolutional networks for large-scale image recognition. 2015."},{"key":"e_1_3_2_1_22_1","first-page":"10002","volume-title":"Proceedings of the 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021, Montreal,QC","author":"Liu Z.","year":"2021","unstructured":"Liu, Z.; Lin, Y.; Cao, Y.; Hu, H.; Wei, Y.; Zhang, Z.; Lin, S.; Guo, B. Swin Transformer: Hierarchical Vision Transformer using Shifted Windows. In Proceedings of the 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021, Montreal,QC, Canada, October 10-17, 2021, 2021, pp. 9992\u201310002."},{"key":"e_1_3_2_1_23_1","first-page":"5766","volume-title":"Proceedings of the Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Wang L.","year":"2017","unstructured":"Wang, L.; Schwing, A.G.; Lazebnik, S. Diverse and Accurate Image Description Using a Variational Auto-Encoder with an Additive Gaussian Encoding Space. In Proceedings of the Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA, 2017, pp. 5756\u20135766."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings","author":"Mao J.","year":"2015","unstructured":"Mao, J.; Xu, W.; Yang, Y.; Wang, J.; Yuille, A.L. Deep Captioning with Multimodal Recurrent Neural Networks (m-RNN). In Proceedings of the 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings, 2015."},{"key":"e_1_3_2_1_25_1","first-page":"3112","volume-title":"Proceedings of the Proc. Adv.Neural Inf. Process. Syst.; Ghahramani, Z.; Welling, M.; Cortes, C.; Lawrence, N.D.; Weinberger, K.Q., Eds.","author":"Sutskever I.","year":"2014","unstructured":"Sutskever, I.; Vinyals, O.; Le, Q.V. Sequence to Sequence Learning with Neural Networks. In Proceedings of the Proc. Adv.Neural Inf. Process. Syst.; Ghahramani, Z.; Welling, M.; Cortes, C.; Lawrence, N.D.; Weinberger, K.Q., Eds., 2014, pp. 3104\u20133112."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_27_1","first-page":"2057","volume-title":"Proceedings of the Proc. Int. Conf. Mach. Learn.","author":"Xu K.","year":"2015","unstructured":"Xu, K.; Ba, J.; Kiros, R.; Cho, K.; Courville, A.; Salakhudinov, R.; Zemel, R.; Bengio, Y. Show, attend and tell: Neural image caption generation with visual attention. In Proceedings of the Proc. Int. Conf. Mach. Learn., 2015, pp. 2048\u20132057."},{"key":"e_1_3_2_1_28_1","volume-title":"Image captioning: Transforming objects into words. arXiv preprint arXiv:1906.05963","author":"Herdade S.","year":"2019","unstructured":"Herdade, S.; Kappeler, A.; Boakye, K.; Soares, J. Image captioning: Transforming objects into words. arXiv preprint arXiv:1906.05963 2019."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01094"},{"key":"e_1_3_2_1_30_1","volume-title":"Unifying Visual-Semantic Embeddings with Multimodal Neural Language Models.CoRR","author":"Kiros R.","year":"2014","unstructured":"Kiros, R.; Salakhutdinov, R.; Zemel, R.S. Unifying Visual-Semantic Embeddings with Multimodal Neural Language Models.CoRR 2014, abs\/1411.2539."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2599174"},{"key":"e_1_3_2_1_32_1","first-page":"21","article-title":"Multitask Learning for Cross-Doma","volume":"2019","author":"Yang M.","unstructured":"Yang, M.; Zhao, W.; Xu, W.; Feng, Y.; Zhao, Z.; Chen, X.; Lei, K. Multitask Learning for Cross-Domain Image Captioning. IEEE Trans. Multimedia 2019, 21, 1047\u20131061.","journal-title":"Image Captioning. IEEE Trans. Multimedia"},{"key":"e_1_3_2_1_33_1","first-page":"21","article-title":"Deep Hierarchical Encoder-Decoder Network for Image Captioning","volume":"2019","author":"Xiao X.","unstructured":"Xiao, X.; Wang, L.; Ding, K.; Xiang, S.; Pan, C. Deep Hierarchical Encoder-Decoder Network for Image Captioning. IEEE Trans. Multimedia 2019, 21, 2942\u20132956.","journal-title":"IEEE Trans. Multimedia"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2947482"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3067449"},{"key":"e_1_3_2_1_36_1","first-page":"29","volume-title":"Proceedings of the Proc. Eur. Conf. Comput. Vis.","author":"Farhadi A.","year":"2010","unstructured":"Farhadi, A.; Hejrati, M.; Sadeghi, M.A.; Young, P.; Rashtchian, C.; Hockenmaier, J.; Forsyth, D. Every picture tells a story:Generating sentences from images. In Proceedings of the Proc. Eur. Conf. Comput. Vis., 2010, pp. 15\u201329."},{"key":"e_1_3_2_1_37_1","first-page":"1143","article-title":"Describing images using 1 million captioned photographs","volume":"24","author":"Ordonez V.","year":"2011","unstructured":"Ordonez, V.; Kulkarni, G.; Berg, T. Im2text: Describing images using 1 million captioned photographs. In Proceedings of the Proc.Adv. Neural Inf. Process. Syst., 2011, Vol. 24, pp. 1143\u20131151.","journal-title":"Proceedings of the Proc.Adv. Neural Inf. Process. Syst."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.162"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.306"},{"key":"e_1_3_2_1_40_1","first-page":"802","article-title":"Convolutional LSTM Network","author":"Shi X.","year":"2015","unstructured":"Shi, X.; Chen, Z.; Wang, H.; Yeung, D.; Wong, W.; Woo, W. Convolutional LSTM Network: A Machine Learning Approach for Precipitation Nowcasting. In Proceedings of the Proc. Adv. Neural Inf. Process. Syst., 2015, pp. 802\u2013810.","journal-title":"A Machine Learning Approach for Precipitation Nowcasting. In Proceedings of the Proc. Adv. Neural Inf. Process. Syst."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01034"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16258"},{"key":"e_1_3_2_1_45_1","first-page":"2293","volume-title":"R. Dual-level Collaborative Transformer for Image Captioning. In Proceedings of the Proc. AAAI Conf. Artif. Intell.","author":"Luo Y.","year":"2021","unstructured":"Luo, Y.; Ji, J.; Sun, X.; Cao, L.; Wu, Y.; Huang, F.; Lin, C.; Ji, R. Dual-level Collaborative Transformer for Image Captioning. In Proceedings of the Proc. AAAI Conf. Artif. Intell., 2021, pp. 2286\u20132293."},{"key":"e_1_3_2_1_46_1","first-page":"15474","volume-title":"R. RSTNet: Captioning With Adaptive Attention on Visual and Non-Visual Words. In Proceedings of the Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Zhang X.","year":"2021","unstructured":"Zhang, X.; Sun, X.; Luo, Y.; Ji, J.; Zhou, Y.; Wu, Y.; Huang, F.; Ji, R. RSTNet: Captioning With Adaptive Attention on Visual and Non-Visual Words. In Proceedings of the Proc. IEEE Conf. Comput. Vis. Pattern Recognit., 2021, pp. 15465\u201315474."},{"key":"e_1_3_2_1_47_1","volume-title":"CPTR: Full Transformer Network for Image Captioning. CoRR","author":"Liu W.","year":"2021","unstructured":"Liu, W.; Chen, S.; Guo, L.; Zhu, X.; Liu, J. CPTR: Full Transformer Network for Image Captioning. CoRR 2021, abs\/2101.10804."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00902"},{"key":"e_1_3_2_1_49_1","first-page":"3491","volume-title":"Proceedings of the Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015","author":"Sohn K.","year":"2015","unstructured":"Sohn, K.; Lee, H.; Yan, X. Learning Structured Output Representation using Deep Conditional Generative Models. In Proceedings of the Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7-12, 2015, Montreal, Quebec, Canada, 2015, pp. 3483\u20133491."},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the 2nd International Conference on Learning Representations, ICLR 2014, Banff, AB, Canada, April 14-16, 2014, Conference Track Proceedings","author":"Kingma D.P.","year":"2014","unstructured":"Kingma, D.P.; Welling, M. Auto-Encoding Variational Bayes. In Proceedings of the 2nd International Conference on Learning Representations, ICLR 2014, Banff, AB, Canada, April 14-16, 2014, Conference Track Proceedings, 2014."},{"key":"e_1_3_2_1_51_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma D.P.","year":"2014","unstructured":"Kingma, D.P.; Ba, J. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 2014."},{"key":"e_1_3_2_1_52_1","first-page":"318","volume-title":"Proceedings of the Proc. Annu. Meeting Assoc. Comput. Linguistics","author":"Papineni K.","year":"2002","unstructured":"Papineni, K.; Roukos, S.; Ward, T.; Zhu, W.J. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the Proc. Annu. Meeting Assoc. Comput. Linguistics, 2002, pp. 311\u2013318."},{"key":"e_1_3_2_1_53_1","first-page":"231","volume-title":"B. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In Proceedings of the Proc. Annu. Meeting Assoc. Comput. Linguistics","author":"Satanjeev","year":"2005","unstructured":"Satanjeev, B. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In Proceedings of the Proc. Annu. Meeting Assoc. Comput. Linguistics, 2005, pp. 228\u2013231."},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the Proc. Annu. Meeting Assoc. Comput. Linguistics","author":"Szpakowicz S.","year":"2004","unstructured":"Szpakowicz, S. Text summarization branches out. In Proceedings of the Proc. Annu. Meeting Assoc. Comput. Linguistics, 2004."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"volume-title":"Diverse image captioning via conditional variational autoencoder and dual contrastive learning[J]. ACM Transactions on Multimedia Com\u2043 puting, Communications, and Applications","year":"2024","key":"e_1_3_2_1_57_1","unstructured":"XU J, LIU B, ZHOU Y, Diverse image captioning via conditional variational autoencoder and dual contrastive learning[J]. ACM Transactions on Multimedia Com\u2043 puting, Communications, and Applications, 2024, 20 (1): 29."}],"event":{"name":"SPML 2024: 2024 7th International Conference on Signal Processing and Machine Learning","acronym":"SPML 2024","location":"Qingdao China"},"container-title":["Proceedings of the 2024 7th International Conference on Signal Processing and Machine Learning"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3686490.3686513","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3686490.3686513","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:05:39Z","timestamp":1750291539000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3686490.3686513"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,12]]},"references-count":57,"alternative-id":["10.1145\/3686490.3686513","10.1145\/3686490"],"URL":"https:\/\/doi.org\/10.1145\/3686490.3686513","relation":{},"subject":[],"published":{"date-parts":[[2024,7,12]]},"assertion":[{"value":"2024-10-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}