{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T03:48:31Z","timestamp":1772164111640,"version":"3.50.1"},"reference-count":67,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2025,5,22]],"date-time":"2025-05-22T00:00:00Z","timestamp":1747872000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,22]],"date-time":"2025-05-22T00:00:00Z","timestamp":1747872000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Knowl Inf Syst"],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1007\/s10115-025-02444-z","type":"journal-article","created":{"date-parts":[[2025,5,21]],"date-time":"2025-05-21T22:44:30Z","timestamp":1747867470000},"page":"7699-7726","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Cross-modal associated learning with spatial\u2013temporal attention for hot topic detection"],"prefix":"10.1007","volume":"67","author":[{"given":"Chengde","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shiyu","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinyu","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xia","family":"Xiao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,5,22]]},"reference":[{"issue":"1","key":"2444_CR1","doi-asserted-by":"publisher","DOI":"10.1098\/rsos.241599","volume":"12","author":"S Gardner","year":"2025","unstructured":"Gardner S, Bezati G, Godfrey T, Baird K, Bilal U, Loudon E, Young R, MacKenzie LE (2025) Analysis of over 1600 chemistry YouTube channels from 2005 to 2023. Royal Soc Open Sci 12(1):241599","journal-title":"Royal Soc Open Sci"},{"key":"2444_CR2","doi-asserted-by":"publisher","first-page":"124","DOI":"10.1109\/THMS.2015.2489681","volume":"46","author":"C Zhang","year":"2019","unstructured":"Zhang C, Wu X, Shyu M-L, Peng Q (2019) Integration of visual temporal information and textual distribution information for news web video event mining. IEEE Trans Hum-Mach Syst 46:124\u2013135. https:\/\/doi.org\/10.1109\/THMS.2015.2489681","journal-title":"IEEE Trans Hum-Mach Syst"},{"issue":"7","key":"2444_CR3","doi-asserted-by":"publisher","first-page":"1494","DOI":"10.1109\/TMM.2017.2674622","volume":"19","author":"W Xu","year":"2017","unstructured":"Xu W, Miao Z, Zhang X-P, Tian Y (2017) A hierarchical spatio-temporal model for human activity recognition. IEEE Trans Multimedia 19(7):1494\u20131509","journal-title":"IEEE Trans Multimedia"},{"key":"2444_CR4","doi-asserted-by":"crossref","unstructured":"Karpathy A, Toderici G, Shetty S, Leung T, Sukthankar R, Fei-Fei L (2014) Large-scale video classification with convolutional neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 1725\u20131732","DOI":"10.1109\/CVPR.2014.223"},{"key":"2444_CR5","first-page":"568","volume":"27","author":"K Simonyan","year":"2014","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. Adv Neural Inf Process Syst 27:568\u2013576","journal-title":"Adv Neural Inf Process Syst"},{"key":"2444_CR6","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"key":"2444_CR7","doi-asserted-by":"crossref","unstructured":"Li Q, Qiu Z, Yao T, Mei T, Rui Y, Luo J (2016) Action recognition by learning deep multi-granular spatio-temporal video representation. In: Proceedings of the 2016 ACM on international conference on multimedia retrieval, 159\u2013166","DOI":"10.1145\/2911996.2912001"},{"key":"2444_CR8","unstructured":"Sharma S, Kiros R, Salakhutdinov R (2015) Action recognition using visual attention. arXiv preprint arXiv:1511.04119"},{"key":"2444_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TIP.2022.3201467","volume":"32","author":"F Wang","year":"2023","unstructured":"Wang F, Xu X, Chen Y, Li X (2023) Fuzzy semantics for arbitrary-shaped scene text detection. IEEE Trans Image Process 32:1\u201312. https:\/\/doi.org\/10.1109\/TIP.2022.3201467","journal-title":"IEEE Trans Image Process"},{"key":"2444_CR10","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2023.119600","volume":"648","author":"D Yu","year":"2023","unstructured":"Yu D, Fang A, Xu Z (2023) Topic research in fuzzy domain: based on LDA topic modelling. Inform Sci 648:119600","journal-title":"Inform Sci"},{"key":"2444_CR11","doi-asserted-by":"publisher","first-page":"262","DOI":"10.1016\/j.ins.2021.04.029","volume":"570","author":"K Xiao","year":"2021","unstructured":"Xiao K, Qian Z, Qin B (2021) A graphical decomposition and similarity measurement approach for topic detection from online news. Inf Sci 570:262\u2013277","journal-title":"Inf Sci"},{"issue":"10","key":"2444_CR12","doi-asserted-by":"publisher","first-page":"1772","DOI":"10.3390\/sym13101772","volume":"13","author":"AK Nandanwar","year":"2021","unstructured":"Nandanwar AK, Choudhary J (2021) Semantic features with contextual knowledge-based web page categorization using the glove model and stacked BiLSTM. Symmetry 13(10):1772","journal-title":"Symmetry"},{"key":"2444_CR13","doi-asserted-by":"publisher","first-page":"98044","DOI":"10.1109\/ACCESS.2020.2995776","volume":"8","author":"W Liu","year":"2020","unstructured":"Liu W, Jiang L, Wu Y, Tang T, Li W (2020) Topic detection and tracking based on event ontology. IEEE Access 8:98044\u201398056","journal-title":"IEEE Access"},{"key":"2444_CR14","doi-asserted-by":"crossref","unstructured":"Jin O, Liu NN, Zhao K, Yu Y, Yang Q (2011) Transferring topical knowledge from auxiliary long texts for short text clustering. In: Proceedings of the 20th ACM international conference on information and knowledge management, 775\u2013784","DOI":"10.1145\/2063576.2063689"},{"issue":"3","key":"2444_CR15","doi-asserted-by":"publisher","first-page":"3209","DOI":"10.1007\/s11042-017-5058-2","volume":"77","author":"Z Zhao","year":"2018","unstructured":"Zhao Z, Xiang R, Su F (2018) Complex event detection via attention-based video representation and classification. Multimedia Tools Appl 77(3):3209\u20133227","journal-title":"Multimedia Tools Appl"},{"issue":"3","key":"2444_CR16","doi-asserted-by":"publisher","first-page":"1549","DOI":"10.1007\/s11276-019-02009-3","volume":"26","author":"J Sun","year":"2020","unstructured":"Sun J, Li L, Li W, Zhang J, Yan C (2020) Enabling 5g: sentimental image dominant graph topic model for cross-modality topic detection. Wireless Netw 26(3):1549\u20131561","journal-title":"Wireless Netw"},{"issue":"3","key":"2444_CR17","doi-asserted-by":"publisher","first-page":"773","DOI":"10.1109\/TCSVT.2018.2808685","volume":"29","author":"Y Peng","year":"2018","unstructured":"Peng Y, Zhao Y, Zhang J (2018) Two-stream collaborative learning with spatial-temporal attention for video classification. IEEE Trans Circ Syst Video Technol 29(3):773\u2013786. https:\/\/doi.org\/10.1109\/TCSVT.2018.2808685","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"2444_CR18","doi-asserted-by":"publisher","first-page":"9225","DOI":"10.1109\/ACCESS.2018.2886366","volume":"7","author":"C Zhang","year":"2018","unstructured":"Zhang C, Lu S, Zhang C, Xiao X, Wang Q, Chen G (2018) A novel hot topic detection framework with integration of image and short text information from twitter. IEEE Access 7:9225\u20139231","journal-title":"IEEE Access"},{"key":"2444_CR19","doi-asserted-by":"publisher","first-page":"1745","DOI":"10.1109\/TIP.2022.3199107","volume":"32","author":"B Li","year":"2023","unstructured":"Li B, Xiao C, Wang L, Wang Y, Lin Z, Li M, An W, Guo Y (2023) Dense nested attention network for infrared small target detection. IEEE Trans Image Process 32:1745\u20131758. https:\/\/doi.org\/10.1109\/TIP.2022.3199107","journal-title":"IEEE Trans Image Process"},{"key":"2444_CR20","doi-asserted-by":"publisher","first-page":"1559","DOI":"10.1109\/TIP.2022.3144017","volume":"31","author":"Y Dong","year":"2022","unstructured":"Dong Y, Liu Q, Du B, Zhang L (2022) Weighted feature fusion of convolutional neural network and graph attention network for hyperspectral image classification. IEEE Trans Image Process 31:1559\u20131572. https:\/\/doi.org\/10.1109\/TIP.2022.3144017","journal-title":"IEEE Trans Image Process"},{"issue":"11","key":"2444_CR21","doi-asserted-by":"publisher","first-page":"3218","DOI":"10.1109\/TCYB.2017.2762344","volume":"48","author":"S Zhao","year":"2017","unstructured":"Zhao S, Gao Y, Ding G, Chua T-S (2017) Real-time multimedia social event detection in microblog. IEEE Trans Cybern 48(11):3218\u20133231","journal-title":"IEEE Trans Cybern"},{"key":"2444_CR22","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1016\/j.future.2021.01.014","volume":"118","author":"K Kumari","year":"2021","unstructured":"Kumari K, Singh JP, Dwivedi YK, Rana NP (2021) Multi-modal aggression identification using convolutional neural network and binary particle swarm optimization. Futur Gener Comput Syst 118:187\u2013197","journal-title":"Futur Gener Comput Syst"},{"issue":"3","key":"2444_CR23","doi-asserted-by":"publisher","first-page":"897","DOI":"10.1007\/s11831-020-09400-w","volume":"28","author":"K Thyagharajan","year":"2021","unstructured":"Thyagharajan K, Kalaiarasi G (2021) A review on near-duplicate detection of images using computer vision techniques. Archiv Comput Methods Eng 28(3):897\u2013916","journal-title":"Archiv Comput Methods Eng"},{"key":"2444_CR24","doi-asserted-by":"publisher","first-page":"26","DOI":"10.1016\/j.sigpro.2015.08.002","volume":"120","author":"C Zhang","year":"2016","unstructured":"Zhang C, Liu D, Wu X, Zhao G, Shyu M-L, Peng Q (2016) Near-duplicate segments based news web video event mining. Signal Process 120:26\u201335","journal-title":"Signal Process"},{"key":"2444_CR25","doi-asserted-by":"publisher","first-page":"420","DOI":"10.1016\/j.ins.2022.12.081","volume":"630","author":"L Zhou","year":"2023","unstructured":"Zhou L, Mao Y, Xiong N, Wang Y, Feng F (2023) BTD: an effective business-related hot topic detection scheme in professional social networks. Inf Sci 630:420\u2013442","journal-title":"Inf Sci"},{"issue":"16","key":"2444_CR26","doi-asserted-by":"publisher","first-page":"11815","DOI":"10.1007\/s00521-023-08323-4","volume":"35","author":"C Zhang","year":"2023","unstructured":"Zhang C, Liu G, Xiao X (2023) Cross-media correlation learning for web video event mining with integrated text semantics and network structural information. Neural Compu Appl 35(16):11815\u201311831","journal-title":"Neural Compu Appl"},{"key":"2444_CR27","doi-asserted-by":"publisher","first-page":"148","DOI":"10.1016\/j.neucom.2022.06.028","volume":"502","author":"C Zhang","year":"2022","unstructured":"Zhang C, Lei Y, Xiao X, Chen X (2022) Cross-media video event mining based on attention graph structure learning. Neurocomputing 502:148\u2013158","journal-title":"Neurocomputing"},{"issue":"3","key":"2444_CR28","doi-asserted-by":"publisher","first-page":"773","DOI":"10.1109\/TCSVT.2018.2808685","volume":"29","author":"Y Peng","year":"2018","unstructured":"Peng Y, Zhao Y, Zhang J (2018) Two-stream collaborative learning with spatial-temporal attention for video classification. IEEE Trans Circ Syst Video Technol 29(3):773\u2013786. https:\/\/doi.org\/10.1109\/TCSVT.2018.2808685","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"2444_CR29","doi-asserted-by":"crossref","unstructured":"Wang X, Zhu L, Yang Y (2021) T2vlad: global-local sequence alignment for text-video retrieval. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 5079\u20135088","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"2444_CR30","unstructured":"Zheng M, You S, Huang L, Su X, Wang F, Qian C, Wang X, Xu C (2023) Cone: contrast your neighbours for supervised image classification. arXiv preprint arXiv:2308.10761"},{"issue":"2\u20133","key":"2444_CR31","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1007\/s11263-005-1838-7","volume":"64","author":"I Laptev","year":"2005","unstructured":"Laptev I (2005) On space-time interest points. Int J Comput Vis 64(2\u20133):107\u2013123","journal-title":"Int J Comput Vis"},{"key":"2444_CR32","doi-asserted-by":"crossref","unstructured":"Dalal N, Triggs B (2005) Histograms of oriented gradients for human detection. In: 2005 IEEE computer society conference on computer vision and pattern recognition (CVPR\u201905), IEEE, vol 1, pp 886\u2013893","DOI":"10.1109\/CVPR.2005.177"},{"key":"2444_CR33","doi-asserted-by":"crossref","unstructured":"Dalal N, Triggs B, Schmid C (2006) Human detection using oriented histograms of flow and appearance. In: Computer Vision\u2013ECCV 2006: 9th European conference on computer vision, Graz, Austria, May 7-13, 2006. Proceedings, Part II, Springer, vol 9, pp 428\u2013441","DOI":"10.1007\/11744047_33"},{"issue":"3","key":"2444_CR34","doi-asserted-by":"publisher","first-page":"624","DOI":"10.1109\/TCSVT.2016.2589838","volume":"27","author":"Y Xian","year":"2016","unstructured":"Xian Y, Rong X, Yang X, Tian Y (2016) Evaluation of low-level features for real-world surveillance event detection. IEEE Trans Circ Syst Video Technol 27(3):624\u2013634","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"2444_CR35","doi-asserted-by":"crossref","unstructured":"Liu J, Kuipers B, Savarese S (2011) Recognizing human actions by attributes. In: CVPR 2011, IEEE, 3337\u20133344","DOI":"10.1109\/CVPR.2011.5995353"},{"issue":"3","key":"2444_CR36","doi-asserted-by":"publisher","first-page":"589","DOI":"10.1109\/TCSVT.2016.2615443","volume":"27","author":"H Fradi","year":"2016","unstructured":"Fradi H, Luvison B, Pham QC (2016) Crowd behavior analysis using local mid-level visual descriptors. IEEE Trans Circ Syst Video Technol 27(3):589\u2013602","journal-title":"IEEE Trans Circ Syst Video Technol"},{"issue":"3","key":"2444_CR37","doi-asserted-by":"publisher","first-page":"635","DOI":"10.1109\/TCSVT.2016.2593609","volume":"27","author":"Y Zhang","year":"2016","unstructured":"Zhang Y, Qin L, Ji R, Zhao S, Huang Q, Luo J (2016) Exploring coherent motion patterns via structured trajectory learning for crowd mood modeling. IEEE Trans Circ Syst Video Technol 27(3):635\u2013648","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"2444_CR38","doi-asserted-by":"crossref","unstructured":"Wu Z, Jiang Y-G, Wang X, Ye H, Xue X (2016) Multi-stream multi-class fusion of deep networks for video classification. In: Proceedings of the 24th ACM international conference on multimedia, 791\u2013800","DOI":"10.1145\/2964284.2964328"},{"issue":"6","key":"2444_CR39","doi-asserted-by":"publisher","first-page":"1263","DOI":"10.1109\/TCSVT.2015.2511543","volume":"27","author":"G-S Xie","year":"2015","unstructured":"Xie G-S, Zhang X-Y, Yan S, Liu C-L (2015) Hybrid CNN and dictionary-based models for scene recognition and domain adaptation. IEEE Trans Circ Syst Video Technol 27(6):1263\u20131274","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"2444_CR40","unstructured":"M, GF, H, ZP (2018) Semantic relationship recognition of oil documents based on improved word vector. Comput Syst Appl 27(8)"},{"key":"2444_CR41","doi-asserted-by":"crossref","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, Van Gool L (2016) Temporal segment networks: towards good practices for deep action recognition. In: European conference on computer vision, pp 20\u201336","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"2444_CR42","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Pinz A, Zisserman A (2016) Convolutional two-stream network fusion for video action recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 1933\u20131941","DOI":"10.1109\/CVPR.2016.213"},{"issue":"2","key":"2444_CR43","first-page":"449","volume":"47","author":"Y Wei","year":"2016","unstructured":"Wei Y, Zhao Y, Lu C, Wei S, Liu L, Zhu Z, Yan S (2016) Cross-modal retrieval with CNN visual features: a new baseline. IEEE Trans Cybern 47(2):449\u2013460","journal-title":"IEEE Trans Cybern"},{"key":"2444_CR44","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-024-20220-z","author":"MS Hossain","year":"2024","unstructured":"Hossain MS, Aktar S, Hossen MB, Hossain MA, Gu N, Huang Z (2024) CsDNet: cross-sketch with dual gated attention for fine-grained image captioning network. Multimedia Tools Appl. https:\/\/doi.org\/10.1007\/s11042-024-20220-z","journal-title":"Multimedia Tools Appl"},{"key":"2444_CR45","doi-asserted-by":"crossref","unstructured":"Liu D, Greene D, Dong R (2022) A novel perspective to look at attention: Bi-level attention-based explainable topic modeling for news classification. arXiv preprint arXiv:2203.07216","DOI":"10.18653\/v1\/2022.findings-acl.178"},{"key":"2444_CR46","doi-asserted-by":"crossref","unstructured":"Kim H-J, Lee JS, Yang H-S (2007) Human action recognition using a modified convolutional neural network. In: Advances in Neural Networks\u2013ISNN 2007: 4th International Symposium on Neural Networks, ISNN 2007, Nanjing, China, June 3-7, 2007, Proceedings, Part II 4, 715\u2013723 . Springer","DOI":"10.1007\/978-3-540-72393-6_85"},{"key":"2444_CR47","doi-asserted-by":"crossref","unstructured":"Richard A, Kuehne H, Gall J (2018) Action sets: weakly supervised action segmentation without ordering constraints. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 5987\u20135996","DOI":"10.1109\/CVPR.2018.00627"},{"key":"2444_CR48","doi-asserted-by":"publisher","unstructured":"Guo S, Lin Y, Feng N, Song C, Wan H (2019) Attention based spatial-temporal graph convolutional networks for traffic flow forecasting. In: Proceedings of the AAAI conference on artificial intelligence, 33, 922\u2013929. https:\/\/doi.org\/10.1609\/aaai.v33i01.3301922","DOI":"10.1609\/aaai.v33i01.3301922"},{"key":"2444_CR49","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2024.102941","volume":"87","author":"MS Hossain","year":"2025","unstructured":"Hossain MS, Aktar S, Hossain MA et al (2025) CM-SC: Cross-modal spatial-channel attention network for image captioning. Displays 87:102941","journal-title":"Displays"},{"issue":"7","key":"2444_CR50","doi-asserted-by":"publisher","first-page":"4043","DOI":"10.1007\/s10115-024-02080-z","volume":"66","author":"Y Xie","year":"2024","unstructured":"Xie Y, Wu J, Zhou Y (2024) GTHP: a novel graph transformer Hawkes process for spatiotemporal event prediction. Knowl Inform Syst 66(7):4043\u20134062","journal-title":"Knowl Inform Syst"},{"issue":"1","key":"2444_CR51","doi-asserted-by":"publisher","first-page":"229","DOI":"10.1109\/TMM.2019.2924576","volume":"22","author":"C Yan","year":"2019","unstructured":"Yan C, Tu Y, Wang X, Zhang Y, Hao X, Zhang Y, Dai Q (2019) Stat: spatial-temporal attention mechanism for video captioning. IEEE Trans Multimedia 22(1):229\u2013241. https:\/\/doi.org\/10.1109\/TMM.2019.2924576","journal-title":"IEEE Trans Multimedia"},{"issue":"1","key":"2444_CR52","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2012","unstructured":"Ji S, Xu W, Yang M, Yu K (2012) 3d convolutional neural networks for human action recognition. IEEE Trans Pattern Anal Mach Intell 35(1):221\u2013231","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"5","key":"2444_CR53","doi-asserted-by":"publisher","first-page":"2061","DOI":"10.1007\/s10115-022-01823-0","volume":"65","author":"X Hou","year":"2023","unstructured":"Hou X, Ma R, Yan L, Ma Z (2023) DAuCNet: deep autoregressive framework for temporal link prediction combining copy mechanism network. Knowl Inf Syst 65(5):2061\u20132085","journal-title":"Knowl Inf Syst"},{"key":"2444_CR54","doi-asserted-by":"crossref","unstructured":"Hossain MS, Aktar S, Gu N, et al (2025) GeoSCN: a novel multimodal self-attention to integrate geometric information on spatial-channel network for fine-grained image captioning. Expert Syst Appl, 126692","DOI":"10.1016\/j.eswa.2025.126692"},{"issue":"11","key":"2444_CR55","doi-asserted-by":"publisher","first-page":"4687","DOI":"10.1007\/s10115-023-01887-6","volume":"65","author":"Y Xie","year":"2023","unstructured":"Xie Y, Xiong Y, Zhang J, Chen C, Zhang Y, Zhao J, Jiao Y, Zhao J, Zhu Y (2023) Temporal super-resolution traffic flow forecasting via continuous-time network dynamics. Knowl Inf Syst 65(11):4687\u20134712","journal-title":"Knowl Inf Syst"},{"key":"2444_CR56","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"2444_CR57","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556"},{"issue":"4","key":"2444_CR58","first-page":"884","volume":"30","author":"Y Zhuo","year":"2019","unstructured":"Zhuo Y, Peng Y et al (2019) Cross-media deep fine-grained association learning method. J Comput Softw 30(4):884\u2013895","journal-title":"J Comput Softw"},{"key":"2444_CR59","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2018) Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805"},{"key":"2444_CR60","doi-asserted-by":"crossref","unstructured":"He K, Fan H, Wu Y, Xie S, Girshick R (2020) Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 9729\u20139738","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"2444_CR61","doi-asserted-by":"crossref","unstructured":"Wu Y, Xie R, Zhu Y, Ao X, Chen X, Zhang X, Zhuang F, Lin L, He Q (2022) Multi-view multi-behavior contrastive learning in recommendation. In: Database Systems for Advanced Applications: 27th International Conference, DASFAA 2022, Virtual Event, April 11\u201314, 2022, Proceedings, Part II, 166\u2013182. Springer","DOI":"10.1007\/978-3-031-00126-0_11"},{"key":"2444_CR62","doi-asserted-by":"crossref","unstructured":"Wang X, Farhadi A, Gupta A (2016) Actions$$\\sim $$ transformations. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2658\u20132667","DOI":"10.1109\/CVPR.2016.291"},{"key":"2444_CR63","doi-asserted-by":"crossref","unstructured":"Cheng H-T, Koc L, Harmsen J, Shaked T, Chandra T, Aradhye H, Anderson G, Corrado G, Chai W, Ispir M, et al (2016) Wide & deep learning for recommender systems. In: Proceedings of the 1st workshop on deep learning for recommender systems, 7\u201310","DOI":"10.1145\/2988450.2988454"},{"key":"2444_CR64","doi-asserted-by":"crossref","unstructured":"Singhal S, Dhawan M, Shah RR, Kumaraguru P (2021) Inter-modality discordance for multimodal fake news detection. In: ACM Multimedia Asia, 1\u20137","DOI":"10.1145\/3469877.3490614"},{"key":"2444_CR65","doi-asserted-by":"crossref","unstructured":"Singhal S, Kabra A, Sharma M, Shah RR, Chakraborty T, Kumaraguru P (2020) Spotfake+: a multimodal framework for fake news detection via transfer learning (student abstract). In: Proceedings of the AAAI conference on artificial intelligence, 34, 13915\u201313916","DOI":"10.1609\/aaai.v34i10.7230"},{"key":"2444_CR66","doi-asserted-by":"publisher","unstructured":"Huang S-C, Shen L, Lungren MP, Yeung S (2021) Gloria: A multimodal global-local representation learning framework for label-efficient medical image recognition. In: Proceedings of the IEEE\/CVF international conference on computer vision, Montreal, QC, Canada, 3942\u20133951. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00391","DOI":"10.1109\/ICCV48922.2021.00391"},{"key":"2444_CR67","unstructured":"Kiela D, Bhooshan S, Firooz H, Perez E, Testuggine D (2019) Supervised multimodal bitransformers for classifying images and text. arXiv preprint arXiv:1909.02950"}],"container-title":["Knowledge and Information Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10115-025-02444-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10115-025-02444-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10115-025-02444-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T15:12:58Z","timestamp":1757171578000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10115-025-02444-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,22]]},"references-count":67,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2025,9]]}},"alternative-id":["2444"],"URL":"https:\/\/doi.org\/10.1007\/s10115-025-02444-z","relation":{},"ISSN":["0219-1377","0219-3116"],"issn-type":[{"value":"0219-1377","type":"print"},{"value":"0219-3116","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,5,22]]},"assertion":[{"value":"19 April 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 April 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 April 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 May 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}