{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,6]],"date-time":"2026-04-06T19:25:07Z","timestamp":1775503507644,"version":"3.50.1"},"reference-count":57,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004607","name":"Guangxi Natural Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004607","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.neunet.2026.108629","type":"journal-article","created":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T17:23:27Z","timestamp":1768929807000},"page":"108629","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Multi-modal feature alignment networks for multi-label image classification"],"prefix":"10.1016","volume":"198","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2727-7969","authenticated-orcid":false,"given":"Wenlan","family":"Kuang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5313-6134","authenticated-orcid":false,"given":"Zhixin","family":"Li","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.108629_bib0001","series-title":"Proceedings of the 16th European conference on computer vision","first-page":"213","article-title":"End-to-end object detection with transformers","author":"Carion","year":"2020"},{"key":"10.1016\/j.neunet.2026.108629_bib0002","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"6714","article-title":"Order-free rnn with visual attention for multi-label classification","author":"Chen","year":"2018"},{"issue":"3","key":"10.1016\/j.neunet.2026.108629_bib0003","doi-asserted-by":"crossref","first-page":"1371","DOI":"10.1109\/TPAMI.2020.3025814","article-title":"Knowledge-guided multi-label few-shot learning for general image recognition","volume":"44","author":"Chen","year":"2022","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.neunet.2026.108629_bib0004","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"6730","article-title":"Recurrent attentional reinforcement learning for multi-label image recognition","author":"Chen","year":"2018"},{"key":"10.1016\/j.neunet.2026.108629_bib0005","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"522","article-title":"Learning semantic-specific graph representation for multi-label image recognition","author":"Chen","year":"2019"},{"key":"10.1016\/j.neunet.2026.108629_bib0006","doi-asserted-by":"crossref","first-page":"2570","DOI":"10.1109\/TIP.2022.3148867","article-title":"SST: Spatial and semantic transformers for multi-label image recognition","volume":"31","author":"Chen","year":"2022","journal-title":"IEEE Transactions on Image Processing"},{"key":"10.1016\/j.neunet.2026.108629_bib0007","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"5177","article-title":"Multi-label image recognition with graph convolutional networks","author":"Chen","year":"2019"},{"key":"10.1016\/j.neunet.2026.108629_bib0008","series-title":"Proceedings of the IEEE international conference on multimedia and expo","first-page":"1","article-title":"MLTR: Multi-label classification with transformer","author":"Cheng","year":"2022"},{"key":"10.1016\/j.neunet.2026.108629_bib0009","series-title":"Proceedings of the ACM international conference on image and video retrieval","first-page":"1","article-title":"NUS-wide: A real-world web image database from national university of Singapore","author":"Chua","year":"2009"},{"key":"10.1016\/j.neunet.2026.108629_bib0010","unstructured":"Dao, S. D., Zhao, E., Phung, D., & Cai, J. (2021). Multi-label image classification with contrastive learning. arXiv preprint arXiv: 2107.11626."},{"key":"10.1016\/j.neunet.2026.108629_bib0011","doi-asserted-by":"crossref","first-page":"4013","DOI":"10.1109\/TMM.2022.3171095","article-title":"Beyond word embeddings: Heterogeneous prior knowledge driven multi-label image classification","volume":"25","author":"Deng","year":"2023","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.108629_bib0012","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S. et al. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv: 2010.11929."},{"key":"10.1016\/j.neunet.2026.108629_bib0013","doi-asserted-by":"crossref","first-page":"98","DOI":"10.1007\/s11263-014-0733-5","article-title":"The pascal visual object classes challenge: A retrospective","volume":"111","author":"Everingham","year":"2015","journal-title":"International Journal of Computer Vision"},{"key":"10.1016\/j.neunet.2026.108629_bib0014","doi-asserted-by":"crossref","first-page":"5920","DOI":"10.1109\/TIP.2021.3088605","article-title":"Learning to discover multi-class attentional regions for multi-label image recognition","volume":"30","author":"Gao","year":"2021","journal-title":"IEEE Transactions on Image Processing"},{"key":"10.1016\/j.neunet.2026.108629_bib0015","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.122111","article-title":"Mineral identification based on natural feature-oriented image processing and multi-label image classification","volume":"238","author":"Gao","year":"2024","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.neunet.2026.108629_bib0016","unstructured":"Gong, Y., Jia, Y., Leung, T., Toshev, A., & Ioffe, S. (2013). Deep convolutional ranking for multilabel image annotation. arXiv preprint arXiv: 1312.4894."},{"key":"10.1016\/j.neunet.2026.108629_bib0017","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"729","article-title":"Visual attention consistency under image transforms for multi-label image classification","author":"Guo","year":"2019"},{"key":"10.1016\/j.neunet.2026.108629_bib0018","series-title":"Advances in neural information processing systems","first-page":"8291","article-title":"Vision GNN: An image is worth graph of nodes","author":"Han","year":"2022"},{"key":"10.1016\/j.neunet.2026.108629_bib0019","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.neunet.2026.108629_bib0020","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"16478","article-title":"General multi-label image classification with transformers","author":"Lanchantin","year":"2021"},{"key":"10.1016\/j.neunet.2026.108629_bib0021","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"15348","article-title":"PatchCT: Aligning patch set and label set with conditional transport for multi-label image classification","author":"Li","year":"2023"},{"issue":"1","key":"10.1016\/j.neunet.2026.108629_bib0022","first-page":"1","article-title":"A semi-supervised learning approach based on adaptive weighted fusion for automatic image annotation","volume":"17","author":"Li","year":"2021","journal-title":"ACM Transactions on Multimedia Computing, Communications, and Applications"},{"key":"10.1016\/j.neunet.2026.108629_bib0023","series-title":"Proceedings of 13th European conference on computer vision","first-page":"740","article-title":"Microsoft CoCo: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.neunet.2026.108629_bib0024","series-title":"Proceedings of the 11th international conference on learning representations","first-page":"1","article-title":"Causality compensated attention for contextual biased visual recognition","author":"Liu","year":"2023"},{"key":"10.1016\/j.neunet.2026.108629_bib0025","unstructured":"Liu, S., Zhang, L., Yang, X., Su, H., & Zhu, J. (2021). Query2Label: A simple transformer way to multi-label classification. arXiv preprint arXiv: 2107.10834."},{"key":"10.1016\/j.neunet.2026.108629_bib0026","doi-asserted-by":"crossref","unstructured":"Ma, L., Sun, D., Wang, L., Zhao, H., & Luo, B. (2023). Semantic-aware dual contrastive learning for multi-label image classification. arXiv preprint arXiv: 2307.09715.","DOI":"10.3233\/FAIA230449"},{"key":"10.1016\/j.neunet.2026.108629_bib0027","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"82","article-title":"Asymmetric loss for multi-label classification","author":"Ridnik","year":"2021"},{"key":"10.1016\/j.neunet.2026.108629_bib0028","doi-asserted-by":"crossref","unstructured":"Sajedi, A., Khaki, S., Lawryshyn, Y. A., & Plataniotis, K. N. (2024). ProbMCL: Simple probabilistic contrastive learning for multi-label visual classification. arXiv preprint arXiv: 2401.01448.","DOI":"10.1109\/ICASSP48485.2024.10447400"},{"key":"10.1016\/j.neunet.2026.108629_bib0029","doi-asserted-by":"crossref","first-page":"626","DOI":"10.1016\/j.neunet.2023.08.052","article-title":"Causal multi-label learning for image classification","volume":"167","author":"Tian","year":"2023","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.108629_bib0030","series-title":"Advances in neural information processing systems","first-page":"5998","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.neunet.2026.108629_bib0031","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"2285","article-title":"CNN-RNN: A unified framework for multi-label image classification","author":"Wang","year":"2016"},{"key":"10.1016\/j.neunet.2026.108629_bib0032","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"12265","article-title":"Multi-label classification with label graph superimposing","author":"Wang","year":"2020"},{"key":"10.1016\/j.neunet.2026.108629_bib0033","series-title":"Proceedings of the IEEE international conference on computer vision","first-page":"464","article-title":"Multi-label image recognition by recurrently discovering attentional regions","author":"Wang","year":"2017"},{"issue":"9","key":"10.1016\/j.neunet.2026.108629_bib0034","doi-asserted-by":"crossref","first-page":"1901","DOI":"10.1109\/TPAMI.2015.2491929","article-title":"HCP: A flexible cnn framework for multi-label image classification","volume":"38","author":"Wei","year":"2016","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"11","key":"10.1016\/j.neunet.2026.108629_bib0035","doi-asserted-by":"crossref","first-page":"6788","DOI":"10.1109\/TCSVT.2023.3268997","article-title":"Semantic-aware graph matching mechanism for multi-label image recognition","volume":"33","author":"Wu","year":"2023","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"issue":"2","key":"10.1016\/j.neunet.2026.108629_bib0036","doi-asserted-by":"crossref","first-page":"924","DOI":"10.1109\/TCSVT.2023.3288205","article-title":"Transformer driven matching selection mechanism for multi-label image classification","volume":"34","author":"Wu","year":"2024","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"issue":"1","key":"10.1016\/j.neunet.2026.108629_bib0037","doi-asserted-by":"crossref","DOI":"10.1016\/j.ipm.2022.103154","article-title":"Unifying knowledge iterative dissemination and relational reconstruction network for image\u2013text matching","volume":"60","author":"Xie","year":"2023","journal-title":"Information Processing and Management"},{"key":"10.1016\/j.neunet.2026.108629_bib0038","doi-asserted-by":"crossref","first-page":"1696","DOI":"10.1109\/TMM.2020.3002185","article-title":"Joint input and output space learning for multi-label image classification","volume":"23","author":"Xu","year":"2021","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.108629_bib0039","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"20349","article-title":"Human-centric scene understanding for 3D large-scale scenarios","author":"Xu","year":"2023"},{"key":"10.1016\/j.neunet.2026.108629_bib0040","doi-asserted-by":"crossref","unstructured":"Yao, R., Jin, S., Xu, L., Zeng, W., Liu, W., Qian, C., Luo, P., & Wu, J. (2023). GKGNet: Group k-nearest neighbor based graph convolutional network for multi-label image recognition. arXiv preprint arXiv: 2308.14378.","DOI":"10.1007\/978-3-031-72649-1_6"},{"key":"10.1016\/j.neunet.2026.108629_bib0041","series-title":"Proceedings of the 16th European conference on computer vision","first-page":"649","article-title":"Attention-driven dynamic graph convolutional network for multi-label image recognition","author":"Ye","year":"2020"},{"key":"10.1016\/j.neunet.2026.108629_bib0042","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"12709","article-title":"Cross-modality attention with semantic graph embedding for multi-label classification","author":"You","year":"2020"},{"key":"10.1016\/j.neunet.2026.108629_bib0043","doi-asserted-by":"crossref","first-page":"322","DOI":"10.1016\/j.patcog.2019.03.006","article-title":"Delta: A deep dual-stream network for multi-label image classification","volume":"91","author":"Yu","year":"2019","journal-title":"Pattern Recognition"},{"key":"10.1016\/j.neunet.2026.108629_bib0044","doi-asserted-by":"crossref","first-page":"129","DOI":"10.1016\/j.neunet.2023.08.023","article-title":"Graph embedding based multi-label zero-shot learning","volume":"167","author":"Zhang","year":"2023","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.108629_bib0045","doi-asserted-by":"crossref","first-page":"3000","DOI":"10.1109\/TIP.2023.3266161","article-title":"Spatial context-aware object-attentional network for multi-label image classification","volume":"32","author":"Zhang","year":"2023","journal-title":"IEEE Transactions on Image Processing"},{"issue":"10","key":"10.1016\/j.neunet.2026.108629_bib0046","doi-asserted-by":"crossref","first-page":"2801","DOI":"10.1109\/TMM.2018.2812605","article-title":"Multilabel image classification with regional latent semantic dependencies","volume":"20","author":"Zhang","year":"2018","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.108629_bib0047","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"16786","article-title":"Multi-label supervised contrastive learning","author":"Zhang","year":"2024"},{"key":"10.1016\/j.neunet.2026.108629_bib0048","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"163","article-title":"Transformer-based dual relation graph for multi-label image recognition","author":"Zhao","year":"2021"},{"key":"10.1016\/j.neunet.2026.108629_bib0049","series-title":"Proceedings of the 29th ACM international conference on multimedia","first-page":"469","article-title":"M3TR: Multi-modal multi-label recognition with transformer","author":"Zhao","year":"2021"},{"key":"10.1016\/j.neunet.2026.108629_bib0050","doi-asserted-by":"crossref","first-page":"496","DOI":"10.1016\/j.neunet.2023.09.031","article-title":"A multidimensional feature fusion network based on MGSE and TAAC for video-based human action recognition","volume":"168","author":"Zhou","year":"2023","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.108629_bib0051","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.109203","article-title":"Feature learning network with transformer for multi-label image classification","volume":"136","author":"Zhou","year":"2023","journal-title":"Pattern Recognition"},{"key":"10.1016\/j.neunet.2026.108629_bib0052","doi-asserted-by":"crossref","first-page":"1143","DOI":"10.1109\/TMM.2023.3277279","article-title":"Mining semantic information with dual relation graph network for multi-label image classification","volume":"26","author":"Zhou","year":"2024","journal-title":"IEEE Transactions on Multimedia"},{"issue":"1","key":"10.1016\/j.neunet.2026.108629_bib0053","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3519030","article-title":"Double attention based on graph attention network for image multi-label classification","volume":"19","author":"Zhou","year":"2023","journal-title":"ACM Transactions on Multimedia Computing, Communications and Applications"},{"issue":"1","key":"10.1016\/j.neunet.2026.108629_bib0054","doi-asserted-by":"crossref","first-page":"342","DOI":"10.1109\/TCSVT.2023.3284812","article-title":"DATran: Dual attention transformer for multi-label image classification","volume":"34","author":"Zhou","year":"2024","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"10.1016\/j.neunet.2026.108629_bib0055","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"5513","article-title":"Learning spatial regularization with image-level supervisions for multi-label image classification","author":"Zhu","year":"2017"},{"key":"10.1016\/j.neunet.2026.108629_bib0056","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"184","article-title":"Residual attention: A simple but effective method for multi-label recognition","author":"Zhu","year":"2021"},{"key":"10.1016\/j.neunet.2026.108629_bib0057","series-title":"Proceedings of the 30th ACM international conference on multimedia","first-page":"3598","article-title":"Two-stream transformer for multi-label image classification","author":"Zhu","year":"2022"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026000912?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026000912?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,6]],"date-time":"2026-04-06T18:37:57Z","timestamp":1775500677000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026000912"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":57,"alternative-id":["S0893608026000912"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.108629","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Multi-modal feature alignment networks for multi-label image classification","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.108629","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"108629"}}