{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T15:59:54Z","timestamp":1779206394149,"version":"3.51.4"},"reference-count":39,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2024,8,12]],"date-time":"2024-08-12T00:00:00Z","timestamp":1723420800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,12]],"date-time":"2024-08-12T00:00:00Z","timestamp":1723420800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62071189"],"award-info":[{"award-number":["62071189"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2024,11]]},"DOI":"10.1007\/s11760-024-03482-w","type":"journal-article","created":{"date-parts":[[2024,8,12]],"date-time":"2024-08-12T02:02:22Z","timestamp":1723428142000},"page":"8403-8412","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Compact bilinear pooling and multi-loss network for social media multimodal 
classification"],"prefix":"10.1007","volume":"18","author":[{"given":"Yushi","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xin","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ming","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"Mei","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ziwen","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yunfei","family":"Tao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,8,12]]},"reference":[{"issue":"5","key":"3482_CR1","first-page":"5105","volume":"35","author":"X Xue","year":"2022","unstructured":"Xue, X., Zhang, C., Niu, Z., Wu, X.: Multi-level attention map network for multimodal sentiment analysis. IEEE Trans. Knowl. Data Eng. 35(5), 5105\u20135118 (2022)","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"3482_CR2","doi-asserted-by":"crossref","unstructured":"Bansal, S., Gowda, K., Kumar, N.: A hybrid deep neural network for multimodal personalized hashtag recommendation. IEEE Trans. Comput. Soc. Syst. 10(5), 2439\u20132459 (2022)","DOI":"10.1109\/TCSS.2022.3184307"},{"issue":"15","key":"3482_CR3","doi-asserted-by":"publisher","first-page":"5528","DOI":"10.3390\/s22155528","volume":"22","author":"J Zhao","year":"2022","unstructured":"Zhao, J., Dong, W., Shi, L., Qiang, W., Kuang, Z., Xu, D., An, T.: Multimodal feature fusion method for unbalanced sample data in social network public opinion. 
Sensors 22(15), 5528 (2022)","journal-title":"Sensors"},{"key":"3482_CR4","doi-asserted-by":"crossref","unstructured":"Borth, D., Ji, R., Chen, T., Breuel, T., Chang, S.-F.: Large-scale visual sentiment ontology and detectors using adjective noun pairs. In: Proceedings of the 21st ACM International Conference on Multimedia, pp. 223\u2013232 (2013)","DOI":"10.1145\/2502081.2502282"},{"key":"3482_CR5","doi-asserted-by":"publisher","first-page":"479","DOI":"10.1007\/s00530-014-0407-8","volume":"22","author":"D Cao","year":"2016","unstructured":"Cao, D., Ji, R., Lin, D., Li, S.: A cross-media public sentiment analysis system for microblog. Multimed. Syst. 22, 479\u2013486 (2016)","journal-title":"Multimed. Syst."},{"key":"3482_CR6","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Adv. Neural Inf. Process. Syst. 25 (2012)"},{"key":"3482_CR7","unstructured":"Zhang, X., Zhao, J., LeCun, Y.: Character-level convolutional networks for text classification. Adv. Neural Inf. Process. Syst. 1, 649\u2013657 (2015)"},{"key":"3482_CR8","doi-asserted-by":"publisher","first-page":"132363","DOI":"10.1109\/ACCESS.2021.3114093","volume":"9","author":"L Ying","year":"2021","unstructured":"Ying, L., Yu, H., Wang, J., Ji, Y., Qian, S.: Multi-level multi-modal cross-attention network for fake news detection. IEEE Access 9, 132363\u2013132373 (2021)","journal-title":"IEEE Access"},{"issue":"6","key":"3482_CR9","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/s00138-021-01249-8","volume":"32","author":"SY Boulahia","year":"2021","unstructured":"Boulahia, S.Y., Amamra, A., Madi, M.R., Daikh, S.: Early, intermediate and late fusion strategies for robust deep learning-based multimodal action recognition. Mach. Vis. Appl. 32(6), 121 (2021)","journal-title":"Mach. Vis. 
Appl."},{"issue":"12","key":"3482_CR10","doi-asserted-by":"publisher","first-page":"2010","DOI":"10.3390\/sym12122010","volume":"12","author":"K Zhang","year":"2020","unstructured":"Zhang, K., Geng, Y., Zhao, J., Liu, J., Li, W.: Sentiment analysis of social media via multimodal feature fusion. Symmetry 12(12), 2010 (2020)","journal-title":"Symmetry"},{"key":"3482_CR11","doi-asserted-by":"crossref","unstructured":"Han, W., Chen, H., Poria, S.: Improving multimodal fusion with hierarchical mutual information maximization for multimodal sentiment analysis. arXiv preprint arXiv:2109.00412 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"issue":"5","key":"3482_CR12","doi-asserted-by":"publisher","first-page":"829","DOI":"10.1162\/neco_a_01273","volume":"32","author":"J Gao","year":"2020","unstructured":"Gao, J., Li, P., Chen, Z., Zhang, J.: A survey on deep learning for multimodal data fusion. Neural Comput. 32(5), 829\u2013864 (2020)","journal-title":"Neural Comput."},{"key":"3482_CR13","doi-asserted-by":"crossref","unstructured":"Chua, W.W., Li, L., Goh, A.: Classifying multimodal data using transformers. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp. 4780\u20134781 (2022)","DOI":"10.1145\/3534678.3542634"},{"issue":"1","key":"3482_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3284750","volume":"15","author":"Y Peng","year":"2019","unstructured":"Peng, Y., Qi, J.: Cm-gans: cross-modal generative adversarial networks for common representation learning. ACM Trans. Multimed. Comput. Commun. Appl. 15(1), 1\u201324 (2019)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"3482_CR15","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. 
arXiv preprint arXiv:1810.04805 (2018)"},{"key":"3482_CR16","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"3482_CR17","doi-asserted-by":"crossref","unstructured":"Gandhi, A., Adhvaryu, K., Poria, S., Cambria, E., Hussain, A.: Multimodal sentiment analysis: a systematic review of history, datasets, multimodal fusion methods, applications, challenges and future directions. Inf. Fusion 91, 424\u2013444 (2023)","DOI":"10.1016\/j.inffus.2022.09.025"},{"key":"3482_CR18","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1007\/s00530-010-0182-0","volume":"16","author":"PK Atrey","year":"2010","unstructured":"Atrey, P.K., Hossain, M.A., El Saddik, A., Kankanhalli, M.S.: Multimodal fusion for multimedia analysis: a survey. Multimed. Syst. 16, 345\u2013379 (2010)","journal-title":"Multimed. Syst."},{"key":"3482_CR19","doi-asserted-by":"crossref","unstructured":"Sun, H., Dhingra, B., Zaheer, M., Mazaitis, K., Salakhutdinov, R., Cohen, W.W.: Open domain question answering using early fusion of knowledge bases and text. arXiv preprint arXiv:1809.00782 (2018)","DOI":"10.18653\/v1\/D18-1455"},{"key":"3482_CR20","unstructured":"Xu, H., He, K., Sigal, L., Sclaroff, S., Saenko, K.: Text-to-clip video retrieval with early fusion and re-captioning. arXiv preprint arXiv:1804.05113 2(6), 7 (2018)"},{"key":"3482_CR21","unstructured":"Joze, H.R.V., Shaban, A., Iuzzolino, M.L., Koishida, K.: Mmtm: multimodal transfer module for cnn fusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
13289\u201313299 (2020)"},{"issue":"6","key":"3482_CR22","doi-asserted-by":"publisher","first-page":"8597","DOI":"10.1007\/s11042-022-12122-9","volume":"81","author":"N Ding","year":"2022","unstructured":"Ding, N., Tian, S.-W., Yu, L.: A multimodal fusion method for sarcasm detection based on late fusion. Multimed. Tools Appl. 81(6), 8597\u20138616 (2022)","journal-title":"Multimed. Tools Appl."},{"key":"3482_CR23","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding. arXiv preprint arXiv:1606.01847 (2016)","DOI":"10.18653\/v1\/D16-1044"},{"issue":"3","key":"3482_CR24","doi-asserted-by":"publisher","first-page":"1093","DOI":"10.3390\/app12031093","volume":"12","author":"J Wang","year":"2022","unstructured":"Wang, J., Mao, H., Li, H.: Fmfn: fine-grained multimodal fusion networks for fake news detection. Appl. Sci. 12(3), 1093 (2022)","journal-title":"Appl. Sci."},{"key":"3482_CR25","doi-asserted-by":"publisher","first-page":"2507","DOI":"10.1007\/s11042-015-2646-x","volume":"75","author":"C Baecchi","year":"2016","unstructured":"Baecchi, C., Uricchio, T., Bertini, M., Del Bimbo, A.: A multimodal feature learning approach for sentiment analysis of social network multimedia. Multimed. Tools Appl. 75, 2507\u20132525 (2016)","journal-title":"Multimed. Tools Appl."},{"key":"3482_CR26","doi-asserted-by":"crossref","unstructured":"Xu, N., Mao, W.: Multisentinet: a deep semantic network for multimodal sentiment analysis. In: Proceedings of the 2017 ACM on Conference on Information and Knowledge Management, pp. 
2399\u20132402 (2017)","DOI":"10.1145\/3132847.3133142"},{"key":"3482_CR27","doi-asserted-by":"publisher","first-page":"429","DOI":"10.1109\/TASLP.2019.2957872","volume":"28","author":"J Yu","year":"2019","unstructured":"Yu, J., Jiang, J., Xia, R.: Entity-sensitive attention and fusion network for entity-level multimodal sentiment classification. IEEE\/ACM Trans. Audio Speech Lang. Process. 28, 429\u2013439 (2019)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"3482_CR28","doi-asserted-by":"publisher","first-page":"4014","DOI":"10.1109\/TMM.2020.3035277","volume":"23","author":"X Yang","year":"2020","unstructured":"Yang, X., Feng, S., Wang, D., Zhang, Y.: Image-text multimodal emotion classification via multi-view attentional network. IEEE Trans. Multimed. 23, 4014\u20134026 (2020)","journal-title":"IEEE Trans. Multimed."},{"key":"3482_CR29","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.neucom.2022.09.140","volume":"514","author":"T-H Cheung","year":"2022","unstructured":"Cheung, T.-H., Lam, K.-M.: Crossmodal bipolar attention for multimodal classification on social media. Neurocomputing 514, 1\u201312 (2022)","journal-title":"Neurocomputing"},{"issue":"7","key":"3482_CR30","doi-asserted-by":"publisher","first-page":"2289","DOI":"10.1007\/s13042-022-01757-7","volume":"14","author":"Z Song","year":"2023","unstructured":"Song, Z., Xue, Y., Gu, D., Zhang, H., Ding, W.: Target-oriented multimodal sentiment classification by using topic model and gating mechanism. Int. J. Mach. Learn. Cybern. 14(7), 2289\u20132299 (2023)","journal-title":"Int. J. Mach. Learn. Cybern."},{"issue":"1","key":"3482_CR31","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3517139","volume":"19","author":"A Yadav","year":"2023","unstructured":"Yadav, A., Vishwakarma, D.K.: A deep multi-level attentive network for multimodal sentiment analysis. ACM Trans. Multimed. Comput. Commun. Appl. 19(1), 1\u201319 (2023)","journal-title":"ACM Trans. 
Multimed. Comput. Commun. Appl."},{"key":"3482_CR32","doi-asserted-by":"publisher","first-page":"14742","DOI":"10.1109\/ACCESS.2023.3244390","volume":"11","author":"H-D Le","year":"2023","unstructured":"Le, H.-D., Lee, G.-S., Kim, S.-H., Kim, S., Yang, H.-J.: Multi-label multimodal emotion recognition with transformer-based fusion and emotion-level representation learning. IEEE Access 11, 14742\u201314751 (2023)","journal-title":"IEEE Access"},{"key":"3482_CR33","doi-asserted-by":"crossref","unstructured":"Charikar, M., Chen, K., Farach-Colton, M.: Finding frequent items in data streams. In: International Colloquium on Automata, Languages, and Programming, pp. 693\u2013703. Springer (2002)","DOI":"10.1007\/3-540-45465-9_59"},{"key":"3482_CR34","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Fu, J., Liu, X., Huang, X.: Adaptive co-attention network for named entity recognition in tweets. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32 (2018)","DOI":"10.1609\/aaai.v32i1.11962"},{"key":"3482_CR35","doi-asserted-by":"crossref","unstructured":"Wang, B., Lu, W.: Learning latent opinions for aspect-level sentiment classification. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32 (2018)","DOI":"10.1609\/aaai.v32i1.12020"},{"key":"3482_CR36","doi-asserted-by":"crossref","unstructured":"Yu, J., Jiang, J.: Adapting bert for target-oriented multimodal sentiment classification. IJCAI (2019)","DOI":"10.24963\/ijcai.2019\/751"},{"key":"3482_CR37","doi-asserted-by":"crossref","unstructured":"Xu, N., Mao, W., Chen, G.: Multi-interactive memory network for aspect based multimodal sentiment analysis. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 33, pp. 371\u2013378 (2019)","DOI":"10.1609\/aaai.v33i01.3301371"},{"key":"3482_CR38","doi-asserted-by":"crossref","unstructured":"Li, Y., Jiang, S., et al.: Multimodal sentiment analysis with image-text correlation modal. 
In: 2023 IEEE International Conferences on Internet of Things (iThings) and IEEE Green Computing & Communications (GreenCom) and IEEE Cyber, Physical & Social Computing (CPSCom) and IEEE Smart Data (SmartData) and IEEE Congress on Cybermatics (Cybermatics), pp. 281\u2013286. IEEE (2023)","DOI":"10.1109\/iThings-GreenCom-CPSCom-SmartData-Cybermatics60724.2023.00067"},{"issue":"1","key":"3482_CR39","doi-asserted-by":"publisher","first-page":"6","DOI":"10.1007\/s13755-022-00197-5","volume":"11","author":"Z Li","year":"2023","unstructured":"Li, Z., An, Z., Cheng, W., Zhou, J., Zheng, F., Hu, B.: Mha: a multimodal hierarchical attention model for depression detection in social media. Health Inf. Sci. Syst. 11(1), 6 (2023)","journal-title":"Health Inf. Sci. Syst."}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-024-03482-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-024-03482-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-024-03482-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,13]],"date-time":"2024-09-13T17:48:05Z","timestamp":1726249685000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-024-03482-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,12]]},"references-count":39,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2024,11]]}},"alternative-id":["3482"],"URL":"https:\/\/doi.org\/10.1007\/s11760-024-03482-w","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"ele
ctronic"}],"subject":[],"published":{"date-parts":[[2024,8,12]]},"assertion":[{"value":"20 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 July 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 July 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 August 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that there are no conflict of interest statements.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}