{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,18]],"date-time":"2026-01-18T00:01:55Z","timestamp":1768694515451,"version":"3.49.0"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2024,9,27]],"date-time":"2024-09-27T00:00:00Z","timestamp":1727395200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,27]],"date-time":"2024-09-27T00:00:00Z","timestamp":1727395200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Fujian University of Technology Research Fund Project","award":["GY-Z220212"],"award-info":[{"award-number":["GY-Z220212"]}]},{"name":"Fujian University of Technology Research Fund Project","award":["GY-Z220212"],"award-info":[{"award-number":["GY-Z220212"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int. J. Mach. Learn. &amp; Cyber."],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1007\/s13042-024-02398-8","type":"journal-article","created":{"date-parts":[[2024,9,27]],"date-time":"2024-09-27T22:01:54Z","timestamp":1727474514000},"page":"2401-2415","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Fine-grained multimodal named entity recognition with heterogeneous image-text similarity graphs"],"prefix":"10.1007","volume":"16","author":[{"given":"YongPeng","family":"Wang","sequence":"first","affiliation":[]},{"given":"ChunMao","family":"Jiang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,27]]},"reference":[{"key":"2398_CR1","doi-asserted-by":"crossref","unstructured":"Kruengkrai C, Nguyen TH, Aljunied SM, Bing L (2020) Improving low-resource named entity recognition using joint sentence and token labeling. In: Proceedings of the 58th annual meeting of the association for computational linguistics. pp 5898\u20135905","DOI":"10.18653\/v1\/2020.acl-main.523"},{"key":"2398_CR2","doi-asserted-by":"crossref","unstructured":"Lu D, Neves L, Carvalho V, Zhang N, Ji H (2018) Visual attention model for name tagging in multimodal social media. In: Proceedings of the 56th annual meeting of the association for computational linguistics, vol 1. Long papers. pp 1990\u20131999","DOI":"10.18653\/v1\/P18-1185"},{"key":"2398_CR3","doi-asserted-by":"crossref","unstructured":"Zheng C, Wu Z, Feng J, Fu Z, Cai Y (2021). MNRE: a challenge multimodal dataset for neural relation extraction with visual evidence in social media posts. In: 2021 IEEE international conference on multimedia and expo (ICME). IEEE, pp 1\u20136","DOI":"10.1109\/ICME51207.2021.9428274"},{"key":"2398_CR4","doi-asserted-by":"crossref","unstructured":"Zhao Y, Wang W, Zhang H, Hu B (2021) Learning homogeneous and heterogeneous co-occurrences for unsupervised cross-modal retrieval. In: 2021 IEEE international conference on multimedia and expo (ICME). IEEE, pp 1\u20136","DOI":"10.1109\/ICME51207.2021.9428240"},{"key":"2398_CR5","doi-asserted-by":"crossref","unstructured":"Yu J, Jiang J, Yang L, Xia R (2020) Improving multimodal named entity recognition via entity span detection with unified multimodal transformer. Association for Computational Linguistics","DOI":"10.18653\/v1\/2020.acl-main.306"},{"key":"2398_CR6","doi-asserted-by":"crossref","unstructured":"Sun L, Wang J, Zhang K, Su Y, Weng F (2021). RpBERT: a text-image relation propagation-based BERT model for multimodal NER. In: Proceedings of the AAAI conference on artificial intelligence, vol 35. pp 13860\u201313868","DOI":"10.1609\/aaai.v35i15.17633"},{"key":"2398_CR7","doi-asserted-by":"crossref","unstructured":"Zhang D, Wei S, Li S, Wu H, Zhu Q, Zhou G (2021) Multi-modal graph fusion for named entity recognition with targeted visual guidance. In: Proceedings of the AAAI conference on artificial intelligence, vol 35. pp 14347\u201314355","DOI":"10.1609\/aaai.v35i16.17687"},{"key":"2398_CR8","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1109\/TASLP.2022.3221017","volume":"31","author":"J Wang","year":"2022","unstructured":"Wang J, Yang Y, Liu K, Zhu Z, Liu X (2022) M3S: scene graph driven multi-granularity multi-task learning for multi-modal NER. IEEE\/ACM Trans Audio Speech Lang Process 31:111\u2013120","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"2398_CR9","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1016\/j.ins.2020.11.024","volume":"554","author":"J Su","year":"2021","unstructured":"Su J, Chen J, Jiang H, Zhou C, Lin H, Ge Y, Wu Q, Lai Y (2021) Multi-modal neural machine translation with deep semantic interactions. Inf Sci 554:47\u201360","journal-title":"Inf Sci"},{"key":"2398_CR10","doi-asserted-by":"crossref","unstructured":"Wang D, Xiong D (2021) Efficient object-level visual context modeling for multimodal machine translation: masking irrelevant objects helps grounding. In: Proceedings of the AAAI conference on artificial intelligence, vol 35. pp 2720\u20132728","DOI":"10.1609\/aaai.v35i4.16376"},{"key":"2398_CR11","doi-asserted-by":"crossref","unstructured":"Ju X, Zhang D, Xiao R, Li J, Li S, Zhang M, Zhou G (2021) Joint multi-modal aspect-sentiment analysis with auxiliary cross-modal relation detection. In: Proceedings of the 2021 conference on empirical methods in natural language processing. pp 4395\u20134405","DOI":"10.18653\/v1\/2021.emnlp-main.360"},{"key":"2398_CR12","doi-asserted-by":"crossref","unstructured":"Yu W, Xu H, Yuan Z, Wu J (2021) Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis. In: Proceedings of the AAAI conference on artificial intelligence, vol 35. pp 10790\u201310797","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"2398_CR13","doi-asserted-by":"crossref","unstructured":"Moon S, Neves L, Carvalho V (2018) Multimodal named entity recognition for short social media posts. arXiv preprint arXiv:1802.07862","DOI":"10.18653\/v1\/N18-1078"},{"key":"2398_CR14","doi-asserted-by":"crossref","unstructured":"Zhang Q, Fu J, Liu X, Huang X (2018) Adaptive co-attention network for named entity recognition in tweets. In: Proceedings of the AAAI conference on artificial intelligence, vol 32","DOI":"10.1609\/aaai.v32i1.11962"},{"key":"2398_CR15","doi-asserted-by":"crossref","unstructured":"Wu Z, Zheng C, Cai Y, Chen J, Leung H-F, Li Q (2020) Multimodal representation with embedded visual guiding objects for named entity recognition in social media posts. In: Proceedings of the 28th ACM international conference on multimedia. pp 1038\u20131046","DOI":"10.1145\/3394171.3413650"},{"key":"2398_CR16","doi-asserted-by":"crossref","unstructured":"Bao X, Tian M, Wang L, Zha Z, Qin B (2024) Contrastive pre-training with multi-level alignment for grounded multimodal named entity recognition. In: Proceedings of the 2024 international conference on multimedia retrieval. pp 795\u2013803","DOI":"10.1145\/3652583.3658011"},{"key":"2398_CR17","doi-asserted-by":"crossref","unstructured":"Lee K-H, Chen X, Hua G, Hu H, He X (2018) Stacked cross attention for image-text matching. In: Proceedings of the European conference on computer vision (ECCV). pp 201\u2013216","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"2398_CR18","doi-asserted-by":"crossref","unstructured":"Chen H, Ding G, Liu X, Lin Z, Liu J, Han J (2020) IMRAM: iterative matching with recurrent attention memory for cross-modal image-text retrieval. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 12655\u201312663","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"2398_CR19","doi-asserted-by":"crossref","unstructured":"Zhelezniak V, Savkov A, Shen A, Hammerla NY (2019) Correlation coefficients and semantic textual similarity. arXiv preprint arXiv:1905.07790","DOI":"10.18653\/v1\/N19-1100"},{"key":"2398_CR20","first-page":"15394","volume":"35","author":"W Cao","year":"2022","unstructured":"Cao W, Zhang Y, Gao J, Cheng A, Cheng K, Cheng J (2022) PKD: general distillation framework for object detectors via Pearson correlation coefficient. Adv Neural Inf Process Syst 35:15394\u201315406","journal-title":"Adv Neural Inf Process Syst"},{"key":"2398_CR21","unstructured":"Li Y, Zemel R, Brockschmidt M, Tarlow D (2016) Gated graph sequence neural networks. In: Proceedings of ICLR\u201916"},{"key":"2398_CR22","unstructured":"Kipf TN, Welling M (2016) Semi-supervised classification with graph convolutional networks. In: International conference on learning representations"},{"issue":"1","key":"2398_CR23","doi-asserted-by":"publisher","first-page":"126","DOI":"10.1109\/TETC.2023.3238046","volume":"12","author":"M He","year":"2023","unstructured":"He M, Chen J, Gong M, Shao Z (2023) HDGCN: dual-channel graph convolutional network with higher-order information for robust feature learning. IEEE Trans Emerg Top Comput 12(1):126\u2013138","journal-title":"IEEE Trans Emerg Top Comput"},{"key":"2398_CR24","doi-asserted-by":"publisher","first-page":"475","DOI":"10.1016\/j.neunet.2023.10.050","volume":"169","author":"L Song","year":"2024","unstructured":"Song L, Li H, Tan Y, Li Z, Shang X (2024) Enhancing enterprise credit risk assessment with cascaded multi-level graph representation learning. Neural Netw 169:475\u2013484","journal-title":"Neural Netw"},{"issue":"1","key":"2398_CR25","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1109\/TNNLS.2021.3089140","volume":"34","author":"J Ma","year":"2021","unstructured":"Ma J, Liu J, Wang Y, Li J, Liu T (2021) Relation-aware fine-grained reasoning network for textbook question answering. IEEE Trans Neural Netw Learn Syst 34(1):15\u201327","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"2398_CR26","doi-asserted-by":"publisher","first-page":"7294","DOI":"10.1109\/TMM.2024.3363641","volume":"26","author":"L Song","year":"2024","unstructured":"Song L, Chen S, Meng Z, Sun M, Shang X (2024) FMSA-SC: a fine-grained multimodal sentiment analysis dataset based on stock comment videos. IEEE Trans Multimedia 26:7294\u20137306","journal-title":"IEEE Trans Multimedia"},{"key":"2398_CR27","doi-asserted-by":"crossref","unstructured":"Li L, Gan Z, Cheng Y, Liu J (2019) Relation-aware graph attention network for visual question answering. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp 10313\u201310322","DOI":"10.1109\/ICCV.2019.01041"},{"key":"2398_CR28","unstructured":"Zhang D, Wu L, Sun C, Li S, Zhu Q, Zhou G. Modeling both context-and speaker-sensitive dependence for emotion detection in multi-speaker conversations"},{"key":"2398_CR29","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1162\/tacl_a_00252","volume":"7","author":"L Song","year":"2019","unstructured":"Song L, Gildea D, Zhang Y, Wang Z, Su J (2019) Semantic neural machine translation using AMR. Trans Assoc Comput Linguist 7:19\u201331","journal-title":"Trans Assoc Comput Linguist"},{"key":"2398_CR30","unstructured":"Xue M, Cai W, Su J, Song L, Ge Y, Liu Y, Wang B. Neural collective entity linking based on recurrent random walk network learning"},{"key":"2398_CR31","doi-asserted-by":"publisher","first-page":"3565","DOI":"10.1109\/TIP.2022.3159472","volume":"31","author":"Y Tu","year":"2022","unstructured":"Tu Y, Li L, Su L, Gao S, Yan C, Zha Z-J, Yu Z, Huang Q (2022) I2 transformer: intra-and inter-relation embedding transformer for TV show captioning. IEEE Trans Image Process 31:3565\u20133577","journal-title":"IEEE Trans Image Process"},{"key":"2398_CR32","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109204","volume":"136","author":"Y Tu","year":"2023","unstructured":"Tu Y, Zhou C, Guo J, Li H, Gao S, Yu Z (2023) Relation-aware attention for video captioning via graph learning. Pattern Recognit 136:109204","journal-title":"Pattern Recognit"},{"key":"2398_CR33","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109441","volume":"139","author":"Y Wang","year":"2023","unstructured":"Wang Y, Liu J, Ma J, Zeng H, Zhang L, Li J (2023) Dynamic dual graph networks for textbook question answering. Pattern Recognit 139:109441","journal-title":"Pattern Recognit"},{"key":"2398_CR34","unstructured":"Kenton JDM-WC, Toutanova LK (2019) BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT. pp 4171\u20134186"},{"key":"2398_CR35","doi-asserted-by":"crossref","unstructured":"Fu Z, Mao Z, Song Y, Zhang Y (2023) Learning semantic relationship among instances for image-text matching. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 15159\u201315168","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"2398_CR36","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez A.N, Kaiser \u0141, Polosukhin I (2017). Attention is all you need. In: Advances in neural information processing systems, vol 30"},{"key":"2398_CR37","first-page":"21","volume":"1050","author":"JL Ba","year":"2016","unstructured":"Ba JL, Kiros JR, Hinton GE (2016) Layer normalization. stat 1050:21","journal-title":"stat"},{"key":"2398_CR38","doi-asserted-by":"crossref","unstructured":"Cao P, Chen Y, Liu K, Zhao J, Liu S (2018) Adversarial transfer learning for Chinese named entity recognition with self-attention mechanism. In: Proceedings of the 2018 conference on empirical methods in natural language processing. pp 182\u2013192","DOI":"10.18653\/v1\/D18-1017"},{"key":"2398_CR39","doi-asserted-by":"crossref","unstructured":"Lison P, Barnes J, Hubin A, Touileb S (2020) Named entity recognition without labelled data: a weak supervision approach. In: Proceedings of the 58th annual meeting of the association for computational linguistics. pp 1518\u20131533","DOI":"10.18653\/v1\/2020.acl-main.139"},{"key":"2398_CR40","unstructured":"Huang Z, Xu W, Yu K (2015) Bidirectional LSTM-CRF models for sequence tagging. arXiv preprint arXiv:1508.01991"},{"key":"2398_CR41","doi-asserted-by":"crossref","unstructured":"Ma X, Hovy E (2016) End-to-end sequence labeling via bi-directional LSTM-CNNS-CRF. In: Proceedings of the 54th annual meeting of the association for computational linguistics, vol 1. Long papers. pp 1064\u20131074","DOI":"10.18653\/v1\/P16-1101"},{"key":"2398_CR42","doi-asserted-by":"crossref","unstructured":"Lample G, Ballesteros M, Subramanian S, Kawakami K, Dyer C (2016) Neural architectures for named entity recognition. In: Proceedings of the 2016 conference of the North American Chapter of the Association for computational linguistics: human language technologies. pp 260\u2013270","DOI":"10.18653\/v1\/N16-1030"},{"key":"2398_CR43","doi-asserted-by":"crossref","unstructured":"Chen X, Zhang N, Li L, Yao Y, Deng S, Tan C, Huang F, Si L, Chen H (2022) Good visual guidance make a better extractor: hierarchical visual prefix for multimodal entity and relation extraction. In: Findings of the Association for Computational Linguistics: NAACL 2022. pp 1607\u20131618","DOI":"10.18653\/v1\/2022.findings-naacl.121"},{"key":"2398_CR44","doi-asserted-by":"crossref","unstructured":"Wang X, Ye J, Li Z, Tian J, Jiang Y, Yan M, Zhang J, Xiao Y (2022) CAT-MNER: multimodal named entity recognition with knowledge-refined cross-modal attention. In: 2022 IEEE international conference on multimedia and expo (ICME). IEEE, pp 1\u20136","DOI":"10.1109\/ICME52920.2022.9859972"},{"key":"2398_CR45","doi-asserted-by":"crossref","unstructured":"Jia M, Shen L, Shen X, Liao L, Chen M, He X, Chen Z, Li J (2023) MNER-QG: an end-to-end MRC framework for multimodal named entity recognition with query grounding. In: Proceedings of the AAAI conference on artificial intelligence, vol 37. pp 8032\u20138040","DOI":"10.1609\/aaai.v37i7.25971"},{"key":"2398_CR46","doi-asserted-by":"crossref","unstructured":"Zhang X, Yuan J, Li L, Liu J (2023) Reducing the bias of visual objects in multimodal named entity recognition. In: Proceedings of the sixteenth ACM international conference on web search and data mining. pp 958\u2013966","DOI":"10.1145\/3539597.3570485"},{"key":"2398_CR47","doi-asserted-by":"crossref","unstructured":"Li X, Sun G, Liu X (2023) ESPVR: entity spans position visual regions for multimodal named entity recognition. In: Findings of the association for computational linguistics: EMNLP 2023. pp 7785\u20137794","DOI":"10.18653\/v1\/2023.findings-emnlp.522"}],"container-title":["International Journal of Machine Learning and Cybernetics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-024-02398-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13042-024-02398-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-024-02398-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,7]],"date-time":"2025-04-07T07:24:38Z","timestamp":1744010678000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13042-024-02398-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,27]]},"references-count":47,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,4]]}},"alternative-id":["2398"],"URL":"https:\/\/doi.org\/10.1007\/s13042-024-02398-8","relation":{},"ISSN":["1868-8071","1868-808X"],"issn-type":[{"value":"1868-8071","type":"print"},{"value":"1868-808X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,27]]},"assertion":[{"value":"10 April 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 September 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 September 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Yongpeng Wang declares that he has no conflict of interest. Chunmao Jiang declares that he has no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval and consent to participate"}},{"value":"Not applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}}]}}