{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,11]],"date-time":"2025-04-11T04:09:02Z","timestamp":1744344542493,"version":"3.40.4"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,4,10]],"date-time":"2025-04-10T00:00:00Z","timestamp":1744243200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,10]],"date-time":"2025-04-10T00:00:00Z","timestamp":1744243200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"DOI":"10.1007\/s42979-025-03906-5","type":"journal-article","created":{"date-parts":[[2025,4,10]],"date-time":"2025-04-10T11:40:47Z","timestamp":1744285247000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["RoLiVit: Feature Fusion Approach for Multimodal Sentiment Analysis Using Deep Learning"],"prefix":"10.1007","volume":"6","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6343-6205","authenticated-orcid":false,"given":"Namrata","family":"Shroff","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3981-955X","authenticated-orcid":false,"given":"Shreya","family":"Patel","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1941-5308","authenticated-orcid":false,"given":"Hemani","family":"Shah","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,4,10]]},"reference":[{"key":"3906_CR1","doi-asserted-by":"publisher","first-page":"16","DOI":"10.1016\/j.cosrev.2017.10.002","volume":"27","author":"MV M\u00e4ntyl\u00e4","year":"2018","unstructured":"M\u00e4ntyl\u00e4 MV, Graziotin D, Kuutila M. The evolution of sentiment analysis\u2014a review of research topics, venues, and top cited papers. Comput Sci Rev. 2018;27:16\u201332.","journal-title":"Comput Sci Rev"},{"key":"3906_CR2","doi-asserted-by":"publisher","unstructured":"Kaur R, Kautish S. Multimodal sentiment analysis: a survey and comparison. In: I. Management Association, editor. Research anthology on implementing sentiment analysis across multiple disciplines. IGI Global Scientific Publishing; 2022. p. 1846\u201370. https:\/\/doi.org\/10.4018\/978-1-6684-6303-1.ch098.","DOI":"10.4018\/978-1-6684-6303-1.ch098"},{"key":"3906_CR3","first-page":"1","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani A. Attention is all you need. Adv Neural Inf Process Syst. 2017;30:1.","journal-title":"Adv Neural Inf Process Syst"},{"key":"3906_CR4","unstructured":"O'shea, K., & Nash, R. (2015). An introduction to convolutional neural networks. arXiv preprint arXiv:1511.08458."},{"issue":"7","key":"3906_CR5","doi-asserted-by":"publisher","first-page":"1235","DOI":"10.1162\/neco_a_01199","volume":"31","author":"Y Yu","year":"2019","unstructured":"Yu Y, Si X, Hu C, Zhang J. A review of recurrent neural networks: LSTM cells and network architectures. Neural Comput. 2019;31(7):1235\u201370.","journal-title":"Neural Comput"},{"key":"3906_CR6","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., & Stoyanov, V. (2019). Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692."},{"key":"3906_CR7","doi-asserted-by":"publisher","unstructured":"Brian McFee, Alexandros Metsai, Matt McVicar, Stefan Balke, Carl Thom\u00e9, Colin Raffel, Frank Zalkow, Ayoub Malek, Dana, Kyungyun Lee, Oriol Nieto, Dan Ellis, Jack Mason, Eric Battenberg, Scott Seyfarth, Ryuichi Yamamoto, viktorandreevichmorozov, Keunwoo Choi, Josh Moore, Lorenz Nickel. (2022). librosa\/librosa: 0.9.2 (0.9.2). Zenodo. https:\/\/doi.org\/10.5281\/zenodo.6759581","DOI":"10.5281\/zenodo.6759581"},{"key":"3906_CR8","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., & Houlsby, N. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929."},{"key":"3906_CR9","doi-asserted-by":"publisher","first-page":"107134","DOI":"10.1016\/j.knosys.2021.107134","volume":"226","author":"M Birjali","year":"2021","unstructured":"Birjali M, Kasri M, Beni-Hssane A. A comprehensive survey on sentiment analysis: Approaches, challenges and trends. Knowl-Based Syst. 2021;226:107134.","journal-title":"Knowl-Based Syst"},{"key":"3906_CR10","doi-asserted-by":"publisher","first-page":"424","DOI":"10.1016\/j.inffus.2022.09.025","volume":"91","author":"A Gandhi","year":"2023","unstructured":"Gandhi A, Adhvaryu K, Poria S, Cambria E, Hussain A. Multimodal sentiment analysis: A systematic review of history, datasets, multimodal fusion methods, applications, challenges and future directions. Information Fusion. 2023;91:424\u201344.","journal-title":"Information Fusion"},{"key":"3906_CR11","doi-asserted-by":"publisher","first-page":"306","DOI":"10.1016\/j.inffus.2023.02.028","volume":"95","author":"L Zhu","year":"2023","unstructured":"Zhu L, Zhu Z, Zhang C, Xu Y, Kong X. Multimodal sentiment analysis based on fusion methods: A survey. Inf Fusion. 2023;95:306\u201325.","journal-title":"Inf Fusion"},{"key":"3906_CR12","series-title":"Communications in computer and information science","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-59097-9_2","volume-title":"Advancements in smart computing and information security. ASCIS 2023","author":"S Patel","year":"2024","unstructured":"Patel S, Shroff N, Shah H (2024). Multimodal sentiment analysis using deep learning: a review. In: Rajagopal S, Popat K, Meva D, Bajeja S, editors. Advancements in smart computing and information security. ASCIS 2023. Communications in computer and information science, vol 2038. Springer, Cham. https:\/\/doi.org\/10.1007\/978-3-031-59097-9_2."},{"key":"3906_CR13","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.imavis.2017.08.003","volume":"65","author":"M Soleymani","year":"2017","unstructured":"Soleymani M, Garcia D, Jou B, Schuller B, Chang SF, Pantic M. A survey of multimodal sentiment analysis. Image Vis Comput. 2017;65:3\u201314.","journal-title":"Image Vis Comput"},{"key":"3906_CR14","unstructured":"Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805."},{"key":"3906_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 770\u2013778).","DOI":"10.1109\/CVPR.2016.90"},{"issue":"10","key":"3906_CR16","first-page":"143","volume":"9","author":"S Tammina","year":"2019","unstructured":"Tammina S. Transfer learning using vgg-16 with deep convolutional neural network for classifying images. Int J Sci Res Publ (IJSRP). 2019;9(10):143\u201350.","journal-title":"Int J Sci Res Publ (IJSRP)"},{"key":"3906_CR17","doi-asserted-by":"crossref","unstructured":"Chollet, F. (2017). Xception: Deep learning with depthwise separable convolutions. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1251\u20131258).","DOI":"10.1109\/CVPR.2017.195"},{"issue":"6","key":"3906_CR18","first-page":"1","volume":"1","author":"N Dave","year":"2013","unstructured":"Dave N. Feature extraction methods LPC, PLP and MFCC in speech recognition. Int J Adv Res Engand Technol. 2013;1(6):1\u20134.","journal-title":"Int J Adv Res Engand Technol"},{"key":"3906_CR19","doi-asserted-by":"crossref","unstructured":"Boukabous, M., & Azizi, M. (2022). Multimodal sentiment analysis using audio and text for crime detection. In 2022 2nd International Conference on Innovative Research in Applied Science, Engineering and Technology (IRASET) (pp. 1\u20135). IEEE.","DOI":"10.1109\/IRASET52964.2022.9738175"},{"key":"3906_CR20","doi-asserted-by":"crossref","unstructured":"Agarwal, A., Yadav, A., & Vishwakarma, D. K. (2019). Multimodal sentiment analysis via RNN variants. In 2019 IEEE International Conference on Big Data, Cloud Computing, Data Science & Engineering (BCD) (pp. 19\u201323). IEEE.","DOI":"10.1109\/BCD.2019.8885108"},{"key":"3906_CR21","doi-asserted-by":"crossref","unstructured":"Rahman, W., Hasan, M. K., Lee, S., Zadeh, A., Mao, C., Morency, L. P., & Hoque, E. (2020). Integrating multimodal information in large pretrained transformers. In Proceedings of the conference. Association for Computational Linguistics. Meeting (Vol. 2020, p. 2359). NIH Public Access.","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"3906_CR22","unstructured":"Zadeh, A., Zellers, R., Pincus, E., & Morency, L. P. (2016). Mosi: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. arXiv preprint arXiv:1606.06259."},{"key":"3906_CR23","unstructured":"Zadeh, A. B., Liang, P. P., Poria, S., Cambria, E., & Morency, L. P. (2018). Multimodal language analysis in the wild: Cmu-mosei dataset and interpretable dynamic fusion graph. In Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (pp. 2236\u20132246)."},{"key":"3906_CR24","doi-asserted-by":"crossref","unstructured":"Poria, S., Hazarika, D., Majumder, N., Naik, G., Cambria, E., & Mihalcea, R. (2018). Meld: A multimodal multi-party dataset for emotion recognition in conversations. arXiv preprint arXiv:1810.02508.","DOI":"10.18653\/v1\/P19-1050"},{"key":"3906_CR25","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"Busso C, Bulut M, Lee CC, Kazemzadeh A, Mower E, Kim S, Narayanan SS. IEMOCAP: Interactive emotional dyadic motion capture database. Lang Resour Eval. 2008;42:335\u201359.","journal-title":"Lang Resour Eval"},{"key":"3906_CR26","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101891","author":"Z Li","year":"2023","unstructured":"Li Z, Guo Q, Pan Y, Ding W, Yu J, Zhang Y, Xie Y. Multi-level correlation mining framework with self-supervised label generation for multimodal sentiment analysis. Inf Fusion. 2023. https:\/\/doi.org\/10.1016\/j.inffus.2023.101891.","journal-title":"Inf Fusion"},{"key":"3906_CR27","doi-asserted-by":"publisher","first-page":"109259","DOI":"10.1016\/j.patcog.2022.109259","volume":"136","author":"D Wang","year":"2023","unstructured":"Wang D, Guo X, Tian Y, Liu J, He L, Luo X. TETFN: A text enhanced transformer fusion network for multimodal sentiment analysis. Pattern Recogn. 2023;136:109259.","journal-title":"Pattern Recogn"},{"key":"3906_CR28","doi-asserted-by":"crossref","unstructured":"Zadeh, A., Chen, M., Poria, S., Cambria, E., & Morency, L. P. (2017). Tensor fusion network for multimodal sentiment analysis. arXiv preprint arXiv:1707.07250.","DOI":"10.18653\/v1\/D17-1115"},{"key":"3906_CR29","doi-asserted-by":"crossref","unstructured":"Liu, Z., Shen, Y., Lakshminarasimhan, V. B., Liang, P. P., Zadeh, A., & Morency, L. P. (2018). Efficient low-rank multimodal fusion with modality-specific factors. arXiv preprint arXiv:1806.00064.","DOI":"10.18653\/v1\/P18-1209"},{"issue":"1","key":"3906_CR30","first-page":"7216","volume":"33","author":"Y Wang","year":"2019","unstructured":"Wang Y, Shen Y, Liu Z, Liang PP, Zadeh A, Morency LP. Words can shift: Dynamically adjusting word representations using nonverbal behaviors. Proc AAAI Conf Artif Intell. 2019;33(1):7216\u201323.","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"3906_CR31","doi-asserted-by":"crossref","unstructured":"Tsai, Y. H. H., Bai, S., Liang, P. P., Kolter, J. Z., Morency, L. P., & Salakhutdinov, R. (2019, July). Multimodal transformer for unaligned multimodal language sequences. In Proceedings of the conference. Association for Computational Linguistics. Meeting (Vol. 2019, p. 6558). NIH Public Access.","DOI":"10.18653\/v1\/P19-1656"},{"key":"3906_CR32","doi-asserted-by":"crossref","unstructured":"Hazarika, D., Zimmermann, R., & Poria, S. (2020, October). Misa: Modality-invariant and-specific representations for multimodal sentiment analysis. In Proceedings of the 28th ACM international conference on multimedia (pp. 1122\u20131131).","DOI":"10.1145\/3394171.3413678"},{"issue":"5","key":"3906_CR33","first-page":"8992","volume":"34","author":"Z Sun","year":"2020","unstructured":"Sun Z, Sarma P, Sethares W, Liang Y. Learning relationships between text, audio, and video via deep canonical correlation for multimodal language analysis. Proc AAAI Confer Artif Intell. 2020;34(5):8992\u20139.","journal-title":"Proc AAAI Confer Artif Intell"},{"issue":"12","key":"3906_CR34","first-page":"10790","volume":"35","author":"W Yu","year":"2021","unstructured":"Yu W, Xu H, Yuan Z, Wu J. Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis. Proceed AAAI Confer Artif Intell. 2021;35(12):10790\u20137.","journal-title":"Proceed AAAI Confer Artif Intell"},{"key":"3906_CR35","doi-asserted-by":"crossref","unstructured":"Han, W., Chen, H., & Poria, S. (2021). Improving multimodal fusion with hierarchical mutual information maximization for multimodal sentiment analysis. arXiv preprint arXiv:2109.00412.","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"issue":"1","key":"3906_CR36","doi-asserted-by":"publisher","first-page":"289","DOI":"10.1007\/s12559-022-10073-9","volume":"15","author":"F Wang","year":"2023","unstructured":"Wang F, Tian S, Yu L, Liu J, Wang J, Li K, Wang Y. TEDT: transformer-based encoding\u2013decoding translation network for multimodal sentiment analysis. Cogn Comput. 2023;15(1):289\u2013303.","journal-title":"Cogn Comput"},{"key":"3906_CR37","doi-asserted-by":"crossref","unstructured":"Hu, G., Lin, T. E., Zhao, Y., Lu, G., Wu, Y., & Li, Y. (2022). Unimse: Towards unified multimodal sentiment analysis and emotion recognition. arXiv preprint arXiv:2211.11256.","DOI":"10.18653\/v1\/2022.emnlp-main.534"},{"key":"3906_CR38","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1016\/j.inffus.2022.11.022","volume":"92","author":"K Kim","year":"2023","unstructured":"Kim K, Park S. AOBERT: All-modalities-in-One BERT for multimodal sentiment analysis. Inf Fusion. 2023;92:37\u201345.","journal-title":"Inf Fusion"},{"key":"3906_CR39","doi-asserted-by":"publisher","first-page":"101958","DOI":"10.1016\/j.inffus.2023.101958","volume":"100","author":"C Zhu","year":"2023","unstructured":"Zhu C, Chen M, Zhang S, Sun C, Liang H, Liu Y, Chen J. SKEAFN: sentiment knowledge enhanced attention fusion network for multimodal sentiment analysis. Inf Fusion. 2023;100:101958.","journal-title":"Inf Fusion"},{"key":"3906_CR40","unstructured":"Wu, Z., Gong, Z., Koo, J., & Hirschberg, J. (2023). Multi-Modality Multi-Loss Fusion Network. arXiv preprint arXiv:2308.00264."},{"key":"3906_CR41","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-023-08366-7","author":"Y Zhao","year":"2023","unstructured":"Zhao Y, Mamat M, Aysa A, Ubul K. Multimodal sentiment system and method based on CRNN-SVM. Neural Comput Applic. 2023. https:\/\/doi.org\/10.1007\/s00521-023-08366-7.","journal-title":"Neural Comput Applic"},{"key":"3906_CR42","doi-asserted-by":"crossref","unstructured":"Lai, S., Hu, X., Li, Y., Ren, Z., Liu, Z., & Miao, D. (2023). Shared and Private Information Learning in Multimodal Sentiment Analysis with Deep Modal Alignment and Self-supervised Multi-Task Learning. arXiv preprint arXiv:2305.08473.","DOI":"10.2139\/ssrn.4564020"}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-03906-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42979-025-03906-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-03906-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,10]],"date-time":"2025-04-10T11:41:09Z","timestamp":1744285269000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42979-025-03906-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,10]]},"references-count":42,"journal-issue":{"issue":"4","published-online":{"date-parts":[[2025,4]]}},"alternative-id":["3906"],"URL":"https:\/\/doi.org\/10.1007\/s42979-025-03906-5","relation":{},"ISSN":["2661-8907"],"issn-type":[{"value":"2661-8907","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,10]]},"assertion":[{"value":"13 May 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 April 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The author declare that they have no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not Applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}},{"value":"Not Applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Informed consent"}}],"article-number":"373"}}