{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T18:03:40Z","timestamp":1780941820604,"version":"3.54.1"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"13","license":[{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1007\/s00371-025-04072-8","type":"journal-article","created":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T11:32:47Z","timestamp":1751369567000},"page":"10841-10855","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["MTFIC: enhanced fashion image captioning via multi-transformer architecture with contrastive and bidirectional encodings"],"prefix":"10.1007","volume":"41","author":[{"given":"Bui Thanh","family":"Hung","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Vo Quoc","family":"Huy","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,7,1]]},"reference":[{"key":"4072_CR1","doi-asserted-by":"crossref","unstructured":"Ge, Y., Zhang, R., Wang, X., Tang, X., Luo, P.: Deepfashion2: A versatile benchmark for detection, pose estimation, segmentation and re-identification of clothing images. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (2019)","DOI":"10.1109\/CVPR.2019.00548"},{"key":"4072_CR2","unstructured":"Li, J., Zhao, J., Wei, Y., Lang, C., Li, Y., Sim, T., Feng, J.: Multiple-human parsing in the wild. arXiv preprint arXiv:1705.07206. (2017)"},{"key":"4072_CR3","doi-asserted-by":"crossref","unstructured":"Huang, Q., Han, X., Lu, T., Liu, G.: Clothing image retrieval based on parts detection and segmentation. In: Proceedings of the 2021 3rd International Conference on Image Processing and Machine Vision (2021)","DOI":"10.1145\/3469951.3469961"},{"issue":"2","key":"4072_CR4","doi-asserted-by":"publisher","first-page":"355","DOI":"10.1111\/cgf.13643","volume":"38","author":"I Santesteban","year":"2019","unstructured":"Santesteban, I., Otaduy, M.A., Casas, D.: Learning-based animation of clothing for virtual try-on. Computer Graph. Forum 38(2), 355\u2013366 (2019)","journal-title":"Computer Graph. Forum"},{"issue":"1","key":"4072_CR5","doi-asserted-by":"publisher","first-page":"539","DOI":"10.1109\/TPAMI.2022.3148210","volume":"45","author":"M Stefanini","year":"2022","unstructured":"Stefanini, M., Cornia, M., Baraldi, L., Cascianelli, S., Fiameni, G., Cucchiara, R.: From show to tell: A survey on deep learning-based image captioning. IEEE Trans. Pattern Anal. Mach. Intell. 45(1), 539\u2013559 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"6","key":"4072_CR6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3295748","volume":"51","author":"MZ Hossain","year":"2019","unstructured":"Hossain, M.Z., Sohel, F., Shiratuddin, M.F., Laga, H.: A comprehensive survey of deep learning for image captioning. ACM Computing Surveys (CsUR) 51(6), 1\u201336 (2019)","journal-title":"ACM Computing Surveys (CsUR)"},{"key":"4072_CR7","doi-asserted-by":"crossref","unstructured":"Wang, C., Yang, H., Bartz, C., Meinel, C.: Image captioning with deep bidirectional LSTMs. In: Proceedings of the 24th ACM international conference on Multimedia, (2016)","DOI":"10.1145\/2964284.2964299"},{"issue":"5","key":"4072_CR8","first-page":"62","volume":"11","author":"M Chohan","year":"2020","unstructured":"Chohan, M., Khan, A., Mahar, M.S., Hassan, S., Ghafoor, A., Khan, M.: Image captioning using deep learning: a systematic. Image 11(5), 62 (2020)","journal-title":"Image"},{"key":"4072_CR9","doi-asserted-by":"publisher","first-page":"33679","DOI":"10.1109\/ACCESS.2022.3161428","volume":"10","author":"R Castro","year":"2022","unstructured":"Castro, R., Pineda, I., Lim, W., Morocho-Cayamcela, M.E.: Deep learning approaches based on transformer architectures for image captioning tasks. IEEE Access 10, 33679\u201333694 (2022)","journal-title":"IEEE Access"},{"issue":"25","key":"4072_CR10","doi-asserted-by":"publisher","first-page":"66263","DOI":"10.1007\/s11042-023-18105-8","volume":"83","author":"BT Hung","year":"2024","unstructured":"Hung, B.T., Thu, N.H.M.: Novelty fused image and text models based on deep neural network and transformer for multimodal sentiment analysis. Multimed. Tools Appl. 83(25), 66263 (2024)","journal-title":"Multimed. Tools Appl."},{"key":"4072_CR11","doi-asserted-by":"publisher","first-page":"835","DOI":"10.1016\/j.ins.2020.09.003","volume":"546","author":"Y Ji","year":"2021","unstructured":"Ji, Y., Zhang, H., Zhang, Z., Liu, M.: CNN-based encoder-decoder networks for salient object detection: A comprehensive review and recent advances. Inf. Sci. 546, 835\u2013857 (2021)","journal-title":"Inf. Sci."},{"key":"4072_CR12","first-page":"12","volume":"7","author":"K He","year":"2015","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. 7, 12 (2015)","journal-title":"Deep residual learning for image recognition."},{"key":"4072_CR13","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"4072_CR14","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"4072_CR15","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: A neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"4072_CR16","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1016\/j.neucom.2021.03.091","volume":"452","author":"Z Niu","year":"2021","unstructured":"Niu, Z., Zhong, G., Yu, H.: A review on the attention mechanism of deep learning. Neurocomputing 452, 48\u201362 (2021)","journal-title":"Neurocomputing"},{"key":"4072_CR17","doi-asserted-by":"crossref","unstructured":"Nguyen, B. T., Prakash, O., Vo, A. H.: Attention mechanism for fashion image captioning. In: Computational Intelligence Methods for Green Technology and Sustainable Development: Proceedings of the International Conference GTSD2020 5. Springer International Publishing (2021)","DOI":"10.1007\/978-3-030-62324-1_9"},{"key":"4072_CR18","doi-asserted-by":"publisher","DOI":"10.1016\/j.dsp.2025.105155","volume":"162","author":"MB Hossen","year":"2025","unstructured":"Hossen, M.B., Ye, Z., Hossain, M.S., Hossain, M.I.: ARAFNet: an attribute refinement attention fusion network for advanced visual captioning. Digital Signal Processing 162, 105155 (2025)","journal-title":"Digital Signal Processing"},{"key":"4072_CR19","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Proc. Syst. (2017)"},{"key":"4072_CR20","doi-asserted-by":"crossref","unstructured":"Hung, B. T., Huy, T. Q.: Named entity recognition based on combining pretrained transformer model and deep learning. In: Artificial Intelligence and Sustainable Computing: Proceedings of ICSISCET 2021. Singapore: Springer Nature Singapore (2022)","DOI":"10.1007\/978-981-19-1653-3_24"},{"key":"4072_CR21","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"4072_CR22","unstructured":"Li, L. H., Yatskar, M., Yin, D., Hsieh, C. J., Chang, K. W.: Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"4072_CR23","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Houlsby, N.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"4072_CR24","doi-asserted-by":"publisher","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., Sutskever, I.: Learning transferable visual models from natural language supervision. In: International Conference Machine Learning. PMLR. https:\/\/doi.org\/10.48550\/arXiv.2103.00020 (2021)","DOI":"10.48550\/arXiv.2103.00020"},{"key":"4072_CR25","unstructured":"Devlin, J., Chang, M. W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805, (2018)"},{"key":"4072_CR26","unstructured":"Yuan, L., Chen, D., Chen, Y. L., Codella, N., Dai, X., Gao, J., Zhang, P. Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)"},{"key":"4072_CR27","doi-asserted-by":"crossref","unstructured":"Desai, K., Johnson, J.: Virtex: Learning visual representations from textual annotations. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (2021)","DOI":"10.1109\/CVPR46437.2021.01101"},{"issue":"35","key":"4072_CR28","first-page":"23716","volume":"6","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., Ring, R.: Flamingo: a visual language model for few-shot learning. Adv. Neural Inf. Proc. Syst. 6(35), 23716\u201323736 (2022)","journal-title":"Adv. Neural Inf. Proc. Syst."},{"key":"4072_CR29","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"4072_CR30","unstructured":"Wang, J., Hu, X., Zhang, P., Li, X., Wang, L., Zhang, L., Liu, Z.: Minivlm: A smaller and faster vision-language model. arXiv preprint arXiv:2012.06946. (2020)."},{"key":"4072_CR31","unstructured":"Yang, X., Zhang, H., Jin, D., Liu, Y., Wu, C. H., Tan, J., Wang, X.: Fashion captioning: Towards generating accurate descriptions with semantic rewards. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XIII 16. Springer International Publishing. (2020)."},{"key":"4072_CR32","unstructured":"Rostamzadeh, N., Hosseini, S., Boquet, T., Stokowiec, W., Zhang, Y., Jauvin, C., Pal, C.: Fashion-gen: The generative fashion dataset and challenge. arXiv preprint arXiv:1806.08317. (2018)."},{"key":"4072_CR33","unstructured":"Tato, A., Nkambou, R.: Improving adam optimizer. (2018)."},{"key":"4072_CR34","unstructured":"Lin, C. Y. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out (2004)"},{"key":"4072_CR35","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: A method for automatic evaluation of machine translation. In: Proceedings of the Annual Meeting on Association for Computational Linguistics, Philadelphia, PA, USA, (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"4072_CR36","unstructured":"Banerjee, S., Lavie, A.: METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the Annual Meeting on Association for Computational Linguistics Workshops, Ann Arbor, MI, USA, (2005)"},{"key":"4072_CR37","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"issue":"3","key":"4072_CR38","doi-asserted-by":"publisher","first-page":"1286","DOI":"10.3390\/s23031286","volume":"23","author":"N Moratelli","year":"2023","unstructured":"Moratelli, N., Barraco, M., Morelli, D., Cornia, M., Baraldi, L., Cucchiara, R.: Fashion-oriented image captioning with external knowledge retrieval and fully attentive gates. Sensors 23(3), 1286 (2023)","journal-title":"Sensors"},{"key":"4072_CR39","doi-asserted-by":"crossref","unstructured":"Qin, Y., Du, J., Zhang, Y., Lu, H.: Look Back and Predict Forward in Image Captioning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Long Beach, CA, USA, pp. 8367\u20138375 (2019)","DOI":"10.1109\/CVPR.2019.00856"},{"key":"4072_CR40","doi-asserted-by":"crossref","unstructured":"Yang, X., Zhang, H., Jin, D., Liu, Y., Wu, C.H., Tan, J., Xie, D., Wang, J., Wang, X.: Fashion Captioning: Towards Generating Accurate Descriptions with Semantic Rewards. arXiv 2020, arXiv:2008.02693v2.","DOI":"10.1007\/978-3-030-58601-0_1"},{"key":"4072_CR41","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-Memory Transformer for Image Captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Virtual (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"4072_CR42","doi-asserted-by":"crossref","unstructured":"Barraco, M., Stefanini, M., Cornia, M., Cascianelli, S., Baraldi, L., Cucchiara, R.: CaMEL: Mean Teacher Learning for Image Captioning. In: Proceedings of the International Conference on Pattern Recognition, Montreal, QC, Canada, (2022)","DOI":"10.1109\/ICPR56361.2022.9955644"},{"key":"4072_CR43","doi-asserted-by":"publisher","unstructured":"Li, S., Yamaguchi, K.: Attention to describe products with attributes, In: 2017 Fifteenth IAPR International Conference on Machine Vision Applications (MVA), https:\/\/doi.org\/10.23919\/MVA.2017.7986839 (2017)","DOI":"10.23919\/MVA.2017.7986839"},{"key":"4072_CR44","unstructured":"Xu K., et al.: Show, attend and tell: Neural image caption generation with visual attention, Prepr. arXiv.1502.03044, (2015)"},{"issue":"5","key":"4072_CR45","first-page":"23716","volume":"13","author":"BT Nguyen","year":"2023","unstructured":"Nguyen, B.T., Nguyen, S.T., Vo, A.H.: Channel and spatial attention mechanism for fashion image captioning. Int. J. Electr. Computer Eng. 13(5), 23716 (2023)","journal-title":"Int. J. Electr. Computer Eng."},{"issue":"9","key":"4072_CR46","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3671000","volume":"20","author":"C Cai","year":"2025","unstructured":"Cai, C., Yap, K.H., Wang, S.: Toward attribute-controlled fashion image captioning. ACM Trans. Multimed. Comput. Commun. Appl. 20(9), 1\u201318 (2025)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"4072_CR47","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-025-03824-w","author":"Y Zhang","year":"2025","unstructured":"Zhang, Y., Tong, J., Liu, H.: SCAP: enhancing image captioning through lightweight feature sifting and hierarchical decoding. Visual Computer (2025). https:\/\/doi.org\/10.1007\/s00371-025-03824-w","journal-title":"Visual Computer"},{"key":"4072_CR48","doi-asserted-by":"crossref","unstructured":"Han, X., Zhu, X., Yu, L., Zhang, L., Song, Y. Z., Xiang, T.: Fame-vil: Multi-tasking vision-language model for heterogeneous fashion tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.00262"},{"key":"4072_CR49","doi-asserted-by":"crossref","unstructured":"Zhao, X., Zhang, Y., Zhang, W., Wu, X. M.: UniFashion: A Unified Vision-Language Model for Multimodal Fashion Retrieval and Generation. In: Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing (2024)","DOI":"10.18653\/v1\/2024.emnlp-main.89"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-04072-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-025-04072-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-04072-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,24]],"date-time":"2025-09-24T14:03:18Z","timestamp":1758722598000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-025-04072-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,1]]},"references-count":49,"journal-issue":{"issue":"13","published-print":{"date-parts":[[2025,10]]}},"alternative-id":["4072"],"URL":"https:\/\/doi.org\/10.1007\/s00371-025-04072-8","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7,1]]},"assertion":[{"value":"17 June 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 July 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of Interest"}},{"value":"This article does not contain any studies with human participants or animals performed by any of the authors.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}}]}}