{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T19:07:04Z","timestamp":1771614424295,"version":"3.50.1"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"24","license":[{"start":{"date-parts":[[2023,11,25]],"date-time":"2023-11-25T00:00:00Z","timestamp":1700870400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,11,25]],"date-time":"2023-11-25T00:00:00Z","timestamp":1700870400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s10489-023-05167-2","type":"journal-article","created":{"date-parts":[[2023,11,25]],"date-time":"2023-11-25T03:01:32Z","timestamp":1700881292000},"page":"30803-30821","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Improving fashion captioning via attribute-based alignment and multi-level language model"],"prefix":"10.1007","volume":"53","author":[{"given":"Yuhao","family":"Tang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1549-3317","authenticated-orcid":false,"given":"Liyan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Ye","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Zhixian","family":"Chen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,11,25]]},"reference":[{"key":"5167_CR1","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp\u00a06077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"5167_CR2","doi-asserted-by":"crossref","unstructured":"Bao C, Zhang X, Chen J, Miao Y (2022) Mmfl-net: multi-scale and multi-granularity feature learning for cross-domain fashion retrieval. Multimed Tools Appl 1\u201333","DOI":"10.1007\/s11042-022-13648-8"},{"issue":"4","key":"5167_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3447239","volume":"54","author":"W-H Cheng","year":"2021","unstructured":"Cheng W-H, Song S, Chen C-Y, Hidayati SC, Liu J (2021) Fashion meets computer vision: a survey. ACM Comput Surv (CSUR) 54(4):1\u201341","journal-title":"ACM Comput Surv (CSUR)"},{"key":"5167_CR4","doi-asserted-by":"crossref","unstructured":"Cornia M, Stefanini M, Baraldi L, Cucchiara R (2020) Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp\u00a010578\u201310587","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"5167_CR5","doi-asserted-by":"crossref","unstructured":"Denkowski M, Lavie A (2014) Meteor universal: language specific translation evaluation for any target language. In: Proceedings of the ninth workshop on statistical machine translation, pages 376\u2013380","DOI":"10.3115\/v1\/W14-3348"},{"key":"5167_CR6","doi-asserted-by":"publisher","first-page":"2287","DOI":"10.1109\/TMM.2021.3078907","volume":"24","author":"Y Ding","year":"2021","unstructured":"Ding Y, Ma Y, Liao L, Wong WK, Chua T-S (2021) Leveraging multiple relations for fashion trend forecasting based on social media. IEEE Trans Multimed 24:2287\u20132299","journal-title":"IEEE Trans Multimed"},{"issue":"5","key":"5167_CR7","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2020.102276","volume":"5","author":"X Gu","year":"2020","unstructured":"Gu X, Gao F, Tan M, Peng P (2020) Fashion analysis and understanding with artificial intelligence. Inf Process Manag 5(5):102276","journal-title":"Inf Process Manag"},{"key":"5167_CR8","doi-asserted-by":"crossref","unstructured":"Huang L, Wang W, Chen J, Wei X-Y (2019) Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp\u00a04634\u20134643","DOI":"10.1109\/ICCV.2019.00473"},{"key":"5167_CR9","doi-asserted-by":"crossref","unstructured":"Jain A, Samala PR, Jyothi P, Mittal D, Singh MK (2021) Perturb, predict & paraphrase: Semi-supervised learning using noisy student for image captioning. In: IJCAI, pp\u00a0758\u2013764","DOI":"10.24963\/ijcai.2021\/105"},{"issue":"9","key":"5167_CR10","doi-asserted-by":"publisher","first-page":"4538","DOI":"10.1109\/TNNLS.2021.3057892","volume":"33","author":"S Jiang","year":"2021","unstructured":"Jiang S, Li J, Fu Y (2021) Deep learning for fashion style generation. IEEE Trans Neural Networks and Learn Syst 33(9):4538\u20134550","journal-title":"IEEE Trans Neural Networks and Learn Syst"},{"key":"5167_CR11","doi-asserted-by":"crossref","unstructured":"Kang Y, Yu B, Xu Z (2023) A novel approach to multi-attribute predictive analysis based on rough fuzzy sets. Appl Intell 1\u201318","DOI":"10.1007\/s10489-022-04360-z"},{"issue":"7","key":"5167_CR12","doi-asserted-by":"publisher","first-page":"10681","DOI":"10.1007\/s11042-022-13714-1","volume":"82","author":"N Kaur","year":"2023","unstructured":"Kaur N, Pandey S (2023) Predicting clothing attributes with cnn and surf based classification model. Multimed Tools Appl 82(7):10681\u201310701","journal-title":"Multimed Tools Appl"},{"key":"5167_CR13","doi-asserted-by":"publisher","first-page":"68","DOI":"10.1016\/j.patrec.2020.12.001","volume":"141","author":"X Li","year":"2021","unstructured":"Li X, Ye Z, Zhang Z, Zhao M (2021) Clothes image caption generation with attribute detection and visual attention model. Pattern Recognit Lett 141:68\u201374","journal-title":"Pattern Recognit Lett"},{"issue":"6","key":"5167_CR14","doi-asserted-by":"publisher","first-page":"3685","DOI":"10.1109\/TCSVT.2021.3107035","volume":"32","author":"A-A Liu","year":"2021","unstructured":"Liu A-A, Zhai Y, Xu N, Nie W, Li W, Zhang Y (2021) Region-aware image captioning via interaction learning. IEEE Trans Circ Syst Video Technol 32(6):3685\u20133696","journal-title":"IEEE Trans Circ Syst Video Technol"},{"issue":"9","key":"5167_CR15","first-page":"1","volume":"55","author":"P Liu","year":"2021","unstructured":"Liu P, Yuan W, Fu J, Jiang Z, Hayashi H, Neubig G (2021) Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing. ACM Comput Surv 55(9):1\u201335","journal-title":"ACM Comput Surv"},{"key":"5167_CR16","doi-asserted-by":"crossref","unstructured":"Liu Z, Luo P, Qiu S, Wang X, Tang X (2016) Deepfashion: powering robust clothes recognition and retrieval with rich annotations. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp\u00a01096\u20131104","DOI":"10.1109\/CVPR.2016.124"},{"key":"5167_CR17","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109420","volume":"138","author":"Y Ma","year":"2023","unstructured":"Ma Y, Ji J, Sun X, Zhou Y, Ji R (2023) Towards local visual modeling for image captioning. Pattern Recognit 138:109420","journal-title":"Pattern Recognit"},{"key":"5167_CR18","unstructured":"Min B, Ross H, Sulem E, Veyseh APB, Nguyen TH, Sainz O, Agirre E, Heintz I, Roth D (2021) Recent advances in natural language processing via large pre-trained language models: a survey. ACM Comput Surv"},{"key":"5167_CR19","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu W-J (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the association for computational linguistics, pp\u00a0311\u2013318","DOI":"10.3115\/1073083.1073135"},{"issue":"13","key":"5167_CR20","doi-asserted-by":"publisher","first-page":"14711","DOI":"10.1007\/s10489-022-03463-x","volume":"52","author":"J Prudviraj","year":"2022","unstructured":"Prudviraj J, Vishnu C, Mohan CK (2022) M-ffn: multi-scale feature fusion network for image captioning. Appl Intell 52(13):14711\u201314723","journal-title":"Appl Intell"},{"key":"5167_CR21","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V (2017) Self-critical sequence training for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp\u00a07008\u20137024","DOI":"10.1109\/CVPR.2017.131"},{"key":"5167_CR22","unstructured":"Rostamzadeh N, Hosseini S, Boquet T, Stokowiec W, Zhang Y, Jauvin C, Pal C (2018) Fashion-gen: the generative fashion dataset and challenge. arXiv:1806.08317"},{"issue":"11","key":"5167_CR23","doi-asserted-by":"publisher","first-page":"3551","DOI":"10.1007\/s00371-021-02178-3","volume":"38","author":"M Shajini","year":"2022","unstructured":"Shajini M, Ramanan A (2022) A knowledge-sharing semi-supervised approach for fashion clothes classification and attribute prediction. Vis Comput 38(11):3551\u20133561","journal-title":"Vis Comput"},{"key":"5167_CR24","doi-asserted-by":"crossref","unstructured":"Sharma D, Dhiman C, Kumar D (2023) Evolution of visual data captioning methods, datasets, and evaluation metrics: a comprehensive survey. Expert Syst Appl 119773","DOI":"10.1016\/j.eswa.2023.119773"},{"issue":"1","key":"5167_CR25","doi-asserted-by":"publisher","first-page":"539","DOI":"10.1109\/TPAMI.2022.3148210","volume":"45","author":"M Stefanini","year":"2022","unstructured":"Stefanini M, Cornia M, Baraldi L, Cascianelli S, Fiameni G, Cucchiara R (2022) From show to tell: a survey on deep learning-based image captioning. IEEE Trans Pattern Anal Mach Intell 45(1):539\u2013559","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"5167_CR26","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence\u00a0Zitnick C, Parikh D (2015) Cider: consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp\u00a04566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"5167_CR27","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2022.105194","volume":"114","author":"C Wang","year":"2022","unstructured":"Wang C, Gu X (2022) Dynamic-balanced double-attention fusion for image captioning. Eng Appl Artif Intell 114:105194","journal-title":"Eng Appl Artif Intell"},{"key":"5167_CR28","doi-asserted-by":"crossref","unstructured":"Wang C, Gu X (2022) Image captioning with adaptive incremental global context attention. Appl Intell 1\u201323","DOI":"10.1007\/s10489-021-02734-3"},{"key":"5167_CR29","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2022.117174","volume":"201","author":"C Wang","year":"2022","unstructured":"Wang C, Shen Y, Ji L (2022) Geometry attention transformer with position-aware lstms for image captioning. Expert Syst Appl 201:117174","journal-title":"Expert Syst Appl"},{"issue":"9","key":"5167_CR30","doi-asserted-by":"publisher","first-page":"6201","DOI":"10.1002\/int.22840","volume":"37","author":"D Wu","year":"2022","unstructured":"Wu D, Li Z, Zhou J, Gan J, Gao W, Li H (2022) Clothing attribute recognition via a holistic relation network. Int J Intell Syst 37(9):6201\u20136220","journal-title":"Int J Intell Syst"},{"key":"5167_CR31","doi-asserted-by":"crossref","unstructured":"Wu H, Gao Y, Guo X, Al-Halah Z, Rennie S, Grauman K, Feris R (2021) Fashion iq: A new dataset towards retrieving images by natural language feedback. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp\u00a011307\u201311317","DOI":"10.1109\/CVPR46437.2021.01115"},{"key":"5167_CR32","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1016\/j.neunet.2022.01.011","volume":"148","author":"T Xian","year":"2022","unstructured":"Xian T, Li Z, Zhang C, Ma H (2022) Dual global enhanced transformer for image captioning. Neural Netw 148:129\u2013141","journal-title":"Neural Netw"},{"key":"5167_CR33","doi-asserted-by":"crossref","unstructured":"Xu P, Zhu X, Clifton DA (2023) Multimodal learning with transformers: a survey. IEEE Trans Pattern Anal Mach Intell","DOI":"10.1109\/TPAMI.2023.3275156"},{"key":"5167_CR34","doi-asserted-by":"crossref","unstructured":"Yang X, Zhang H, Jin D, Liu Y, Wu C-H, Tan J, Xie D, Wang J, Wang X (2020) Fashion captioning: towards generating accurate descriptions with semantic rewards. In: European conference on computer vision, Springer, pp\u00a01\u201317","DOI":"10.1007\/978-3-030-58601-0_1"},{"key":"5167_CR35","first-page":"1","volume":"60","author":"Z Yuan","year":"2022","unstructured":"Yuan Z, Mou L, Wang Q, Zhu XX (2022) From easy to hard: Learning language-guided curriculum for visual question answering on remote sensing data. IEEE Trans Geosci Remote Sens 60:1\u201311","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"5167_CR36","doi-asserted-by":"publisher","first-page":"3548","DOI":"10.1007\/s10489-020-01950-7","volume":"51","author":"X Yue","year":"2021","unstructured":"Yue X, Zhang C, Fujita H, Lv Y (2021) Clothing fashion style recognition with design issue graph. Appl Intell 51:3548\u20133560","journal-title":"Appl Intell"},{"key":"5167_CR37","doi-asserted-by":"crossref","unstructured":"Zeng F, Zhao M, Zhang Z, Gao S, Cheng L (2022) Joint clothes detection and attribution prediction via anchor-free framework with decoupled representation transformer. In: Proceedings of the 31st ACM international conference on information & knowledge management, pp\u00a02444\u20132454","DOI":"10.1145\/3511808.3557369"},{"key":"5167_CR38","doi-asserted-by":"crossref","unstructured":"Zhang J, Fang Z, Sun H, Wang Z (2022) Adaptive semantic-enhanced transformer for image captioning. IEEE Trans Neural Netw Learn Syst","DOI":"10.1109\/TNNLS.2022.3185320"},{"key":"5167_CR39","doi-asserted-by":"crossref","unstructured":"Zhang J, Fang Z, Wang Z (2022) Multi-feature fusion enhanced transformer with multi-layer fused decoding for image captioning. Appl Intell pp\u00a01\u201317","DOI":"10.1007\/s10489-022-04202-y"},{"key":"5167_CR40","doi-asserted-by":"crossref","unstructured":"Zhang X, Sun X, Luo Y, Ji J, Zhou Y, Wu Y, Huang F, Ji R (2021) Rstnet: captioning with adaptive attention on visual and non-visual words. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp\u00a015465\u201315474","DOI":"10.1109\/CVPR46437.2021.01521"},{"key":"5167_CR41","doi-asserted-by":"crossref","unstructured":"Zhang Z, Shi Y, Yuan C, Li B, Wang P, Hu W, Zha Z-J (2020) Object relational graph with teacher-recommended learning for video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp\u00a013278\u201313288","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"5167_CR42","doi-asserted-by":"crossref","unstructured":"Zhou Y, Zhang Y, Hu Z, Wang M (2021) Semi-autoregressive transformer for image captioning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp\u00a03139\u20133143","DOI":"10.1109\/ICCVW54120.2021.00350"},{"key":"5167_CR43","doi-asserted-by":"publisher","first-page":"62","DOI":"10.1016\/j.neucom.2022.04.121","volume":"495","author":"Z Zhou","year":"2022","unstructured":"Zhou Z, Su Z, Wang R (2022) Attribute-aware heterogeneous graph network for fashion compatibility prediction. Neurocomputing 495:62\u201374","journal-title":"Neurocomputing"},{"key":"5167_CR44","doi-asserted-by":"crossref","unstructured":"Zhuge M, Gao D, Fan D-P, Jin L, Chen B, Zhou H, Qiu M, Shao L (2021) Kaleido-bert: vision-language pre-training on fashion domain. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp\u00a012647\u201312657","DOI":"10.1109\/CVPR46437.2021.01246"},{"issue":"5","key":"5167_CR45","doi-asserted-by":"publisher","first-page":"3833","DOI":"10.1007\/s10462-021-10092-2","volume":"55","author":"Z Zohourianshahzadi","year":"2022","unstructured":"Zohourianshahzadi Z, Kalita JK (2022) Neural attention for image captioning: review of outstanding methods. Artif Intell Rev 55(5):3833\u20133862","journal-title":"Artif Intell Rev"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-023-05167-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-023-05167-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-023-05167-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,28]],"date-time":"2023-12-28T06:28:22Z","timestamp":1703744902000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-023-05167-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,25]]},"references-count":45,"journal-issue":{"issue":"24","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["5167"],"URL":"https:\/\/doi.org\/10.1007\/s10489-023-05167-2","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,11,25]]},"assertion":[{"value":"7 November 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 November 2023","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"We declare that we do not have any commercial or associative interest that represents a conflict of interest in connection with the work submitted.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}]}}