{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,19]],"date-time":"2026-04-19T18:50:09Z","timestamp":1776624609596,"version":"3.51.2"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,10,10]],"date-time":"2023-10-10T00:00:00Z","timestamp":1696896000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,10,10]],"date-time":"2023-10-10T00:00:00Z","timestamp":1696896000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LGN22F020002"],"award-info":[{"award-number":["LGN22F020002"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Pattern Anal Applic"],"published-print":{"date-parts":[[2023,11]]},"DOI":"10.1007\/s10044-023-01204-5","type":"journal-article","created":{"date-parts":[[2023,10,10]],"date-time":"2023-10-10T07:02:42Z","timestamp":1696921362000},"page":"1793-1804","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Unsupervised multimodal learning for image-text relation classification in tweets"],"prefix":"10.1007","volume":"26","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2923-3281","authenticated-orcid":false,"given":"Lin","family":"Sun","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qingyuan","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Long","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yindu","family":"Su","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,10,10]]},"reference":[{"key":"1204_CR1","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1007\/s13735-019-00187-6","volume":"9","author":"C Otto","year":"2020","unstructured":"Otto C, Springstein M, Anand A (2020) Ewerth R Characterization and classification of semantic image-text relations. Int J Multimed Inf Retrieval 9:31\u201345","journal-title":"Int J Multimed Inf Retrieval"},{"key":"1204_CR2","doi-asserted-by":"crossref","unstructured":"Sun L, Wang J, Zhang K, Su Y, Weng F (2021) Rpbert: A text-image relation propagation-based BERT model for multimodal NER. In: AAAI, pp 13860\u201313868","DOI":"10.1609\/aaai.v35i15.17633"},{"key":"1204_CR3","doi-asserted-by":"crossref","unstructured":"Ju X, Zhang D, Xiao R, Li J, Li S, Zhang M, Zhou G (2021) Joint multi-modal aspect-sentiment analysis with auxiliary cross-modal relation detection. In: EMNLP, pp 4395\u20134405","DOI":"10.18653\/v1\/2021.emnlp-main.360"},{"key":"1204_CR4","unstructured":"Sosea T, Sirbu I, Caragea C, Caragea D, Rebedea T (2021) Using the image-text relationship to improve multimodal disaster tweet classification. In: ISCRAM 2021 conference proceedings\u201418th international conference on information systems for crisis response and management, pp 691\u2013704"},{"key":"1204_CR5","doi-asserted-by":"crossref","unstructured":"Vempala A, Preotiuc-Pietro D (2019) Categorizing and inferring the relationship between the text and image of twitter posts. In: Annual meeting of the association for computational linguistics","DOI":"10.18653\/v1\/P19-1272"},{"issue":"3","key":"1204_CR6","doi-asserted-by":"publisher","first-page":"337","DOI":"10.1177\/1470357205055928","volume":"4","author":"R Martinec","year":"2005","unstructured":"Martinec R, Salway A (2005) A system for image-text relations in new (and old) media. Vis Commun 4(3):337\u2013371","journal-title":"Vis Commun"},{"key":"1204_CR7","doi-asserted-by":"crossref","unstructured":"Landis JR, Koch GG (1977) The measurement of observer agreement for categorical data. Biometrics 159\u2013174","DOI":"10.2307\/2529310"},{"issue":"1","key":"1204_CR8","first-page":"13","volume":"23","author":"J Carletta","year":"1997","unstructured":"Carletta J, Isard A, Isard S, Kowtko JC, Doherty-Sneddon G, Anderson AH (1997) The reliability of a dialogue structure coding scheme. COLING 23(1):13\u201331","journal-title":"COLING"},{"issue":"4","key":"1204_CR9","first-page":"555","volume":"34","author":"R Artstein","year":"2008","unstructured":"Artstein R, Poesio M (2008) Inter-coder agreement for computational linguistics. COLING 34(4):555\u2013596","journal-title":"COLING"},{"issue":"6","key":"1204_CR10","doi-asserted-by":"publisher","first-page":"647","DOI":"10.1108\/00220410310506303","volume":"59","author":"EE Marsh","year":"2003","unstructured":"Marsh EE, White MD (2003) A taxonomy of relationships between images and text. J Document 59(6):647\u2013672","journal-title":"J Document"},{"issue":"4","key":"1204_CR11","doi-asserted-by":"publisher","first-page":"34","DOI":"10.1145\/2611388","volume":"10","author":"Z Wang","year":"2014","unstructured":"Wang Z, Cui P, Xie L, Zhu W, Rui Y, Yang S (2014) Bilateral correspondence model for words-and-pictures association in multimedia-rich microblogs. ACM Trans Multim Comput Commun Appl 10(4):34\u201313421","journal-title":"ACM Trans Multim Comput Commun Appl"},{"key":"1204_CR12","doi-asserted-by":"crossref","unstructured":"Chen T, Lu D, Kan MY, Cui P (2013) Understanding and classifying image tweets","DOI":"10.1145\/2502081.2502203"},{"key":"1204_CR13","doi-asserted-by":"crossref","unstructured":"Chen T, SalahEldeen H, He X, Kan MY, Lu D (2015) Velda: relating an image tweet\u2019s text and images. In: AAAI conference on artificial intelligence","DOI":"10.1609\/aaai.v29i1.9168"},{"key":"1204_CR14","unstructured":"Zhang M, Hwa R, Kovashka A (2018) Equal but not the same: understanding the implicit relationship between persuasive images and text. In: British machine vision conference"},{"key":"1204_CR15","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1007\/s13735-017-0142-y","volume":"7","author":"CA Henning","year":"2017","unstructured":"Henning CA, Ewerth R (2017) Estimating the information gap between textual and visual representations. Int J Multimed Inf Retrieval 7:43\u201356","journal-title":"Int J Multimed Inf Retrieval"},{"key":"1204_CR16","doi-asserted-by":"crossref","unstructured":"Kruk J, Lubin J, Sikka K, Lin X, Jurafsky D, Divakaran A (2019) Integrating text and image: Determining multimodal document intent in instagram posts. In: Conference on empirical methods in natural language processing","DOI":"10.18653\/v1\/D19-1469"},{"key":"1204_CR17","doi-asserted-by":"crossref","unstructured":"Caron M, Bojanowski P, Joulin A, Douze M (2018) Deep clustering for unsupervised learning of visual features. In: European conference on computer vision","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"1204_CR18","unstructured":"Alwassel H, Mahajan D, Korbar B, Torresani L, Ghanem B, Tran D (2020) Self-supervised learning by cross-modal audio-video clustering. In: Advances in neural information processing systems, vol 33, pp 9758\u20139770"},{"key":"1204_CR19","unstructured":"Asano YM, Rupprecht C, Vedaldi A (2020) Self-labelling via simultaneous clustering and representation learning. In: International conference on learning representations"},{"key":"1204_CR20","unstructured":"Caron M, Misra I, Mairal J, Goyal P, Bojanowski P, Joulin A (2020) Unsupervised learning of visual features by contrasting cluster assignments. In: Neural information processing systems"},{"issue":"1","key":"1204_CR21","doi-asserted-by":"publisher","first-page":"276","DOI":"10.1109\/TIP.2016.2624140","volume":"26","author":"Z Li","year":"2016","unstructured":"Li Z, Tang J (2016) Weakly supervised deep matrix factorization for social image understanding. IEEE Trans Image Process 26(1):276\u2013288","journal-title":"IEEE Trans Image Process"},{"issue":"10","key":"1204_CR22","doi-asserted-by":"publisher","first-page":"2085","DOI":"10.1109\/TPAMI.2015.2400461","volume":"37","author":"Z Li","year":"2015","unstructured":"Li Z, Liu J, Tang J, Lu H (2015) Robust structured subspace learning for data representation. IEEE Trans Pattern Anal Mach Intell 37(10):2085\u20132098","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"9","key":"1204_CR23","doi-asserted-by":"publisher","first-page":"2070","DOI":"10.1109\/TPAMI.2018.2852750","volume":"41","author":"Z Li","year":"2019","unstructured":"Li Z, Tang J, Mei T (2019) Deep collaborative embedding for social image understanding. IEEE Trans Pattern Anal Mach Intell 41(9):2070\u20132083","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"1204_CR24","doi-asserted-by":"publisher","first-page":"2265","DOI":"10.1007\/s11263-020-01331-0","volume":"128","author":"Z Li","year":"2020","unstructured":"Li Z, Tang J, Zhang L, Yang J (2020) Weakly-supervised semantic guided hashing for social image retrieval. Int J Comput Vision 128:2265\u20132278","journal-title":"Int J Comput Vision"},{"key":"1204_CR25","unstructured":"Devlin J, Chang M, Lee K, Toutanova K (2019) BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL, pp 4171\u20134186"},{"key":"1204_CR26","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: CVPR, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"1204_CR27","doi-asserted-by":"publisher","first-page":"321","DOI":"10.1613\/jair.953","volume":"16","author":"NV Chawla","year":"2002","unstructured":"Chawla NV, Bowyer KW, Hall LO, Kegelmeyer WP (2002) SMOTE: synthetic minority over-sampling technique. J Artif Intell Res 16:321\u2013357","journal-title":"J Artif Intell Res"},{"issue":"2","key":"1204_CR28","first-page":"539","volume":"39","author":"XY Liu","year":"2008","unstructured":"Liu XY, Wu J, Zhou ZH (2008) Exploratory undersampling for class-imbalance learning. IEEE Trans Syst Man Cyber B 39(2):539\u2013550","journal-title":"IEEE Trans Syst Man Cyber B"},{"issue":"17","key":"1204_CR29","first-page":"1","volume":"18","author":"G Lema\u00eetre","year":"2017","unstructured":"Lema\u00eetre G, Nogueira F, Aridas CK (2017) Imbalanced-learn: a python toolbox to tackle the curse of imbalanced datasets in machine learning. JMLR 18(17):1\u20135","journal-title":"JMLR"},{"key":"1204_CR30","doi-asserted-by":"crossref","unstructured":"He K, Fan H, Wu Y, Xie S, Girshick R (2020) Momentum contrast for unsupervised visual representation learning. In: CVPR, pp 9726\u20139735","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"1204_CR31","unstructured":"Xie J, Girshick RB, Farhadi A (2016) Unsupervised deep embedding for clustering analysis. In: Balcan M, Weinberger KQ (eds) ICML, pp 478\u2013487"},{"issue":"4","key":"1204_CR32","first-page":"927","volume":"20","author":"Y Hu","year":"2018","unstructured":"Hu Y, Zheng L, Yang Y, Huang Y (2018) Twitter100k: a real-world dataset for weakly supervised cross-media retrieval. IEEE TMM 20(4):927\u2013938","journal-title":"IEEE TMM"},{"key":"1204_CR33","unstructured":"Radford A, Kim J.W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, et\u00a0al (2021) Learning transferable visual models from natural language supervision. In: ICML, pp 8748\u20138763"},{"key":"1204_CR34","doi-asserted-by":"crossref","unstructured":"Szegedy C, Liu W, Jia Y, Sermanet P, Reed SE, Anguelov D, Erhan D, Vanhoucke V, Rabinovich A (2015) Going deeper with convolutions. In: CVPR, pp 1\u20139","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"1204_CR35","doi-asserted-by":"crossref","unstructured":"Hessel J, Lee L (2020) Does my multimodal model learn cross-modal interactions? it\u2019s harder to tell than you might think! In: EMNLP, pp 861\u2013877","DOI":"10.18653\/v1\/2020.emnlp-main.62"},{"key":"1204_CR36","unstructured":"Tan M, Le Q (2019) Efficientnet: Rethinking model scaling for convolutional neural networks. In: ICML, pp 6105\u20136114"},{"key":"1204_CR37","unstructured":"Liu Y, Ott M, Goyal N, Du J, Joshi M, Chen D, Levy O, Lewis M, Zettlemoyer L, Stoyanov V (2019) Roberta: a robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692"},{"key":"1204_CR38","doi-asserted-by":"crossref","unstructured":"Tan H, Bansal M (2019) LXMERT: Learning cross-modality encoder representations from transformers. In: EMNLP, pp. 5100\u20135111","DOI":"10.18653\/v1\/D19-1514"},{"key":"1204_CR39","doi-asserted-by":"crossref","unstructured":"Fu J, Xu S, Liu H, Liu Y, Xie N, Wang CC, Liu J, Sun Y, Wang B (2022) Cma-clip: Cross-modality attention clip for text-image classification. In: 2022 IEEE international conference on image processing (ICIP), pp 2846\u20132850","DOI":"10.1109\/ICIP46576.2022.9897323"},{"key":"1204_CR40","unstructured":"Kingma D.P, Ba J (2015) Adam: A method for stochastic optimization. In: ICLR"},{"key":"1204_CR41","unstructured":"MacQueen J, et\u00a0al (1967) Some methods for classification and analysis of multivariate observations. In: Proceedings of the fifth berkeley symposium on mathematical statistics and probability, pp 281\u2013297"},{"key":"1204_CR42","unstructured":"Bishop CM (2007) Pattern recognition and machine learning, 5th Edition. In: Information science and statistics"},{"key":"1204_CR43","unstructured":"Ester M, Kriegel H, Sander J, Xu X (1996) A density-based algorithm for discovering clusters in large spatial databases with noise. In: KDD, pp 226\u2013231"},{"key":"1204_CR44","doi-asserted-by":"publisher","first-page":"395","DOI":"10.1007\/s11222-007-9033-z","volume":"17","author":"U Von Luxburg","year":"2007","unstructured":"Von Luxburg U (2007) A tutorial on spectral clustering. Stat Comput 17:395\u2013416","journal-title":"Stat Comput"},{"key":"1204_CR45","doi-asserted-by":"crossref","unstructured":"Schwartz H.A, Giorgi S, Sap M, Crutchley P, Eichstaedt J, Ungar L (2017) Dlatk: differential language analysis toolkit. In: EMNLP, pp 55\u201360","DOI":"10.18653\/v1\/D17-2010"}],"container-title":["Pattern Analysis and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10044-023-01204-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10044-023-01204-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10044-023-01204-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,21]],"date-time":"2023-11-21T20:17:37Z","timestamp":1700597857000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10044-023-01204-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,10]]},"references-count":45,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023,11]]}},"alternative-id":["1204"],"URL":"https:\/\/doi.org\/10.1007\/s10044-023-01204-5","relation":{},"ISSN":["1433-7541","1433-755X"],"issn-type":[{"value":"1433-7541","type":"print"},{"value":"1433-755X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,10,10]]},"assertion":[{"value":"15 May 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 September 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 October 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}