{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,10]],"date-time":"2026-06-10T08:11:18Z","timestamp":1781079078399,"version":"3.54.1"},"reference-count":51,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2022,1,4]],"date-time":"2022-01-04T00:00:00Z","timestamp":1641254400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,4]],"date-time":"2022-01-04T00:00:00Z","timestamp":1641254400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"national natural science foundation of china","doi-asserted-by":"publisher","award":["61772321"],"award-info":[{"award-number":["61772321"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100007129","name":"natural science foundation of shandong","doi-asserted-by":"crossref","award":["ZR202011020044"],"award-info":[{"award-number":["ZR202011020044"]}],"id":[{"id":"10.13039\/501100007129","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"national natural science foundation of china","doi-asserted-by":"publisher","award":["81973981"],"award-info":[{"award-number":["81973981"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Key Project of Research and Development in Shandong Province","award":["2019RKB14090"],"award-info":[{"award-number":["2019RKB14090"]}]},{"name":"Project of Traditional Chinese Medicine and Technology Development Plan Program in Shandong province","award":["2019-0018"],"award-info":[{"award-number":["2019-0018"]}]},{"name":"Shandong Postgraduate Education Quality Improvement Plan","award":["SDYKC19147"],"award-info":[{"award-number":["SDYKC19147"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Process Lett"],"published-print":{"date-parts":[[2022,6]]},"DOI":"10.1007\/s11063-021-10713-5","type":"journal-article","created":{"date-parts":[[2022,1,4]],"date-time":"2022-01-04T08:03:19Z","timestamp":1641283399000},"page":"1943-1960","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":31,"title":["Transformer-Based Interactive Multi-Modal Attention Network for Video Sentiment Detection"],"prefix":"10.1007","volume":"54","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2010-278X","authenticated-orcid":false,"given":"Xuqiang","family":"Zhuang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fangai","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jian","family":"Hou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jianhua","family":"Hao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaohong","family":"Cai","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2022,1,4]]},"reference":[{"key":"10713_CR1","doi-asserted-by":"crossref","unstructured":"Poria S, Cambria E, Hazarika D, Majumder N, Zadeh A, Morency LP (2017) Context-dependent sentiment analysis in user-generated videos. In: Proceedings of the 55th annual meeting of the association for computational linguistics (volume 1: Long papers), pp 873\u2013883","DOI":"10.18653\/v1\/P17-1081"},{"key":"10713_CR2","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.imavis.2017.08.003","volume":"65","author":"M Soleymani","year":"2017","unstructured":"Soleymani M, Garcia D, Jou B, Schuller B, Chang SF, Pantic M (2017) A survey of multimodal sentiment analysis. Image Vis Comput 65:3\u201314","journal-title":"Image Vis Comput"},{"issue":"4","key":"10713_CR3","doi-asserted-by":"publisher","first-page":"e1253","DOI":"10.1002\/widm.1253","volume":"8","author":"L Zhang","year":"2018","unstructured":"Zhang L, Wang S, Liu B (2018) Deep learning for sentiment analysis: a survey. Wiley Interdiscip Rev Data Min Knowl Discov 8(4):e1253","journal-title":"Wiley Interdiscip Rev Data Min Knowl Discov"},{"issue":"5","key":"10713_CR4","doi-asserted-by":"publisher","first-page":"2019","DOI":"10.1109\/TIP.2014.2311377","volume":"23","author":"J Yu","year":"2014","unstructured":"Yu J, Rui Y, Tao D (2014) Click prediction for web image reranking using multimodal sparse coding. IEEE Trans Image Process 23(5):2019\u20132032","journal-title":"IEEE Trans Image Process"},{"key":"10713_CR5","doi-asserted-by":"publisher","unstructured":"Yu J, Tan M, Zhang H, Tao D, Rui Y (2019) Hierarchical deep click feature prediction for fine-grained image recognition. IEEE Trans Pattern Anal Mach Intell. https:\/\/doi.org\/10.1109\/TPAMI.2019.2932058","DOI":"10.1109\/TPAMI.2019.2932058"},{"key":"10713_CR6","unstructured":"Dumpala SH, Sheikh I, Chakraborty R, Kopparapu SK (2019) Audio-visual fusion for sentiment classification using cross-modal autoencoder. In: 32nd conference on neural information processing systems (NIPS 2018), pp 1\u20134"},{"key":"10713_CR7","doi-asserted-by":"crossref","unstructured":"Dumpala SH, Sheikh I, Chakraborty R, Kopparapu SK (2018) Sentiment classification on erroneous ASR transcripts: a multi view learning approach. In: 2018 IEEE Spoken language technology workshop (SLT). IEEE, pp 807\u2013814","DOI":"10.1109\/SLT.2018.8639665"},{"key":"10713_CR8","doi-asserted-by":"crossref","unstructured":"Sheikh I, Dumpala SH, Chakraborty R, Kopparapu SK (2018) Sentiment analysis using imperfect views from spoken language and acoustic modalities. In: Proceedings of grand challenge and workshop on human multimodal language (Challenge-HML), pp 35\u201339","DOI":"10.18653\/v1\/W18-3305"},{"issue":"4","key":"10713_CR9","first-page":"372","volume":"9","author":"A Kumar","year":"2012","unstructured":"Kumar A, Sebastian TM (2012) Sentiment analysis on twitter. Int J Comput Sci Issues (IJCSI) 9(4):372","journal-title":"Int J Comput Sci Issues (IJCSI)"},{"key":"10713_CR10","doi-asserted-by":"crossref","unstructured":"Sun Z, Sarma PK, Sethares W, Bucy EP (2019) Multi-modal sentiment analysis using deep canonical correlation analysis. arXiv:1907.08696","DOI":"10.21437\/Interspeech.2019-2482"},{"issue":"18","key":"10713_CR11","doi-asserted-by":"publisher","first-page":"5276","DOI":"10.3390\/s20185276","volume":"20","author":"MA Mohammed Almansor","year":"2020","unstructured":"Mohammed Almansor MA, Zhang C, Khan W, Hussain A, Alhusaini N (2020) Cross lingual sentiment analysis: a clustering-based bee colony instance selection and target-based feature weighting approach. Sensors 20(18):5276","journal-title":"Sensors"},{"key":"10713_CR12","doi-asserted-by":"crossref","unstructured":"Chen M, Wang S, Liang PP, Baltru\u0161aitis T, Zadeh A, Morency LP (2017) Multimodal sentiment analysis with word-level fusion and reinforcement learning. In: Proceedings of the 19th ACM international conference on multimodal interaction, pp 163\u2013171","DOI":"10.1145\/3136755.3136801"},{"issue":"2","key":"10713_CR13","first-page":"38","volume":"10","author":"R Kaur","year":"2019","unstructured":"Kaur R, Kautish S (2019) Multimodal sentiment analysis: a survey and comparison. Int J Serv Sci Manag Eng Technol (IJSSMET) 10(2):38\u201358","journal-title":"Int J Serv Sci Manag Eng Technol (IJSSMET)"},{"key":"10713_CR14","doi-asserted-by":"crossref","unstructured":"Poria S, Chaturvedi I, Cambria E, Hussain A (2016) Convolutional MKL based multimodal emotion recognition and sentiment analysis. In: 2016 IEEE 16th international conference on data mining (ICDM), IEEE, pp 439\u2013448","DOI":"10.1109\/ICDM.2016.0055"},{"issue":"6","key":"10713_CR15","doi-asserted-by":"publisher","first-page":"17","DOI":"10.1109\/MIS.2018.2882362","volume":"33","author":"S Poria","year":"2018","unstructured":"Poria S, Majumder N, Hazarika D, Cambria E, Gelbukh A, Hussain A (2018) Multimodal sentiment analysis: addressing key issues and setting up the baselines. IEEE Intell Syst 33(6):17\u201325","journal-title":"IEEE Intell Syst"},{"key":"10713_CR16","doi-asserted-by":"crossref","unstructured":"Agarwal A, Yadav A, Vishwakarma DK (2019) Multimodal sentiment analysis via RNN variants. In: 2019 IEEE international conference on big data, cloud computing, data science & engineering (BCD). IEEE, pp 19\u201323","DOI":"10.1109\/BCD.2019.8885108"},{"key":"10713_CR17","doi-asserted-by":"publisher","first-page":"124","DOI":"10.1016\/j.knosys.2018.07.041","volume":"161","author":"N Majumder","year":"2018","unstructured":"Majumder N, Hazarika D, Gelbukh A, Cambria E, Poria S (2018) Multimodal sentiment analysis using hierarchical fusion with context modeling. Knowl-Based Syst 161:124\u2013133","journal-title":"Knowl-Based Syst"},{"key":"10713_CR18","doi-asserted-by":"crossref","unstructured":"Xi C, Lu G, Yan J (2020) Multimodal sentiment analysis based on multi-head attention mechanism. In: Proceedings of the 4th international conference on machine learning and soft computing, pp 34\u201339","DOI":"10.1145\/3380688.3380693"},{"key":"10713_CR19","first-page":"2514","volume":"2020","author":"Z Wang","year":"2020","unstructured":"Wang Z, Wan Z, Wan X (2020) Transmodality: an end2end fusion method with transformer for multimodal sentiment analysis. Proc Web Conf 2020:2514\u20132520","journal-title":"Proc Web Conf"},{"key":"10713_CR20","doi-asserted-by":"crossref","unstructured":"Wang H, Meghawat A, Morency LP, Xing EP (2017) Select-additive learning: improving generalization in multimodal sentiment analysis. In: 2017 IEEE international conference on multimedia and expo (ICME). IEEE, pp 949\u2013954","DOI":"10.1109\/ICME.2017.8019301"},{"key":"10713_CR21","doi-asserted-by":"crossref","unstructured":"Cambria E, Hazarika D, Poria S, Hussain A, Subramanyam R (2017) Benchmarking multimodal sentiment analysis. In: International conference on computational linguistics and intelligent text processing. Springer, pp 166\u2013179","DOI":"10.1007\/978-3-319-77116-8_13"},{"issue":"11","key":"10713_CR22","first-page":"1233","volume":"3","author":"S Fulse","year":"2014","unstructured":"Fulse S, Sugandhi R, Mahajan A (2014) A survey on multimodal sentiment analysis. Int J Eng Res Technol 3(11):1233\u20131238","journal-title":"Int J Eng Res Technol"},{"key":"10713_CR23","doi-asserted-by":"crossref","unstructured":"Poria S, Cambria E, Hazarika D, Mazumder N, Zadeh A, Morency LP (2017) Multi-level multiple attentions for contextual multimodal sentiment analysis. In: 2017 IEEE international conference on data mining (ICDM). IEEE, pp 1033\u20131038","DOI":"10.1109\/ICDM.2017.134"},{"key":"10713_CR24","doi-asserted-by":"crossref","unstructured":"Ranganathan H, Chakraborty S, Panchanathan S (2016) Multimodal emotion recognition using deep learning architectures. In: 2016 IEEE Winter conference on applications of computer vision (WACV). IEEE, pp 1\u20139","DOI":"10.1109\/WACV.2016.7477679"},{"key":"10713_CR25","doi-asserted-by":"crossref","unstructured":"Poria S, Cambria E, Gelbukh A (2015) Deep convolutional neural network textual features and multiple kernel learning for utterance-level multimodal sentiment analysis. In: Proceedings of the 2015 conference on empirical methods in natural language processing, pp 2539\u20132544","DOI":"10.18653\/v1\/D15-1303"},{"key":"10713_CR26","doi-asserted-by":"crossref","unstructured":"Luo Z, Xu H, Chen F (2019) Audio sentiment analysis by heterogeneous signal features learned from utterance-based parallel neural network. In: AffCon@AAAI","DOI":"10.29007\/7mhj"},{"key":"10713_CR27","doi-asserted-by":"crossref","unstructured":"Huddar MG, Sannakki SS, Rajpurohit VS (2018) An ensemble approach to utterance level multimodal sentiment analysis. In: 2018 International conference on computational techniques, electronics and mechanical systems (CTEMS). IEEE, pp 145\u2013150","DOI":"10.1109\/CTEMS.2018.8769162"},{"key":"10713_CR28","unstructured":"Deng D, Zhou Y, Pi J, Shi BE (2018) Multimodal utterance-level affect analysis using visual, audio and text features. arXiv:1805.00625"},{"key":"10713_CR29","unstructured":"Devlin J, Chang MW, Lee K, Toutanova K (2018) Bert: pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805"},{"key":"10713_CR30","doi-asserted-by":"crossref","unstructured":"Wang M, Cao D, Li L, Li S, Ji R (2014) Microblog sentiment analysis based on cross-media bag-of-words model. In: Proceedings of international conference on internet multimedia computing and service, pp 76\u201380","DOI":"10.1145\/2632856.2632912"},{"issue":"4","key":"10713_CR31","doi-asserted-by":"publisher","first-page":"479","DOI":"10.1007\/s00530-014-0407-8","volume":"22","author":"D Cao","year":"2016","unstructured":"Cao D, Ji R, Lin D, Li S (2016) A cross-media public sentiment analysis system for microblog. Multimed Syst 22(4):479\u2013486","journal-title":"Multimed Syst"},{"key":"10713_CR32","doi-asserted-by":"crossref","unstructured":"You Q, Luo J, Jin H, Yang J (2016) Cross-modality consistent regression for joint visual-textual sentiment analysis of social multimedia. In: Proceedings of the Ninth ACM international conference on Web search and data mining, pp 13\u201322","DOI":"10.1145\/2835776.2835779"},{"key":"10713_CR33","doi-asserted-by":"crossref","unstructured":"You Q, Cao L, Jin H, Luo J (2016) Robust visual-textual sentiment analysis: when attention meets tree-structured recursive neural networks. In: Proceedings of the 24th ACM international conference on multimedia, pp 1008\u20131017","DOI":"10.1145\/2964284.2964288"},{"key":"10713_CR34","doi-asserted-by":"crossref","unstructured":"Zadeh A, Chen M, Poria S, Cambria E, Morency LP (2017) Tensor fusion network for multimodal sentiment analysis. arXiv:1707.07250","DOI":"10.18653\/v1\/D17-1115"},{"key":"10713_CR35","doi-asserted-by":"crossref","unstructured":"You Q, Luo J, Jin H, Yang J (2015) Joint visual-textual sentiment analysis with deep neural networks. In: Proceedings of the 23rd ACM international conference on Multimedia, pp 1071\u20131074","DOI":"10.1145\/2733373.2806284"},{"key":"10713_CR36","doi-asserted-by":"crossref","unstructured":"Zhu X, Cao B, Xu S, Liu B, Cao J (2019) Joint visual-textual sentiment analysis based on cross-modality attention mechanism. In: International conference on multimedia modeling. Springer, pp 264\u2013276","DOI":"10.1007\/978-3-030-05710-7_22"},{"key":"10713_CR37","doi-asserted-by":"crossref","unstructured":"Zadeh A, Liang PP, Mazumder N, Poria S, Cambria E, Morency LP (2018) Memory fusion network for multi-view sequential learning. In: Proceedings of the AAAI conference on artificial intelligence, vol\u00a032","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"10713_CR38","doi-asserted-by":"crossref","unstructured":"Zadeh AB, Liang PP, Poria S, Cambria E, Morency LP (2018) Multimodal language analysis in the wild: Cmu-mosei dataset and interpretable dynamic fusion graph. In: Proceedings of the 56th annual meeting of the association for computational linguistics (Volume 1: Long Papers), pp 2236\u20132246","DOI":"10.18653\/v1\/P18-1208"},{"key":"10713_CR39","doi-asserted-by":"crossref","unstructured":"Hutto C, Gilbert E (2014) Vader: a parsimonious rule-based model for sentiment analysis of social media text. In: Proceedings of the international AAAI conference on web and social media, vol\u00a08","DOI":"10.1609\/icwsm.v8i1.14550"},{"key":"10713_CR40","doi-asserted-by":"crossref","unstructured":"O\u2019Connor B, Balasubramanyan R, Routledge BR, Smith NA (2010) From tweets to polls: linking text sentiment to public opinion time series. In: Fourth international AAAI conference on weblogs and social media","DOI":"10.1609\/icwsm.v4i1.14031"},{"key":"10713_CR41","doi-asserted-by":"crossref","unstructured":"Borth D, Chen T, Ji R, Chang SF (2013) Sentibank: large-scale ontology and classifiers for detecting sentiment and emotions in visual content. In: Proceedings of the 21st ACM international conference on Multimedia, pp 459\u2013460","DOI":"10.1145\/2502081.2502268"},{"key":"10713_CR42","doi-asserted-by":"crossref","unstructured":"Siersdorfer S, Minack E, Deng F, Hare J (2010) Analyzing and predicting sentiment of images on the social web. In: Proceedings of the 18th ACM international conference on Multimedia, pp 715\u2013718","DOI":"10.1145\/1873951.1874060"},{"key":"10713_CR43","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning CD (2014) Glove: global vectors for word representation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pp 1532\u20131543","DOI":"10.3115\/v1\/D14-1162"},{"key":"10713_CR44","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556"},{"key":"10713_CR45","doi-asserted-by":"crossref","unstructured":"Eyben F, W\u00f6llmer M, Schuller B (2010) Opensmile: the Munich versatile and fast open-source audio feature extractor. In: Proceedings of the 18th ACM international conference on multimedia, pp 1459\u20131462","DOI":"10.1145\/1873951.1874246"},{"key":"10713_CR46","unstructured":"Rakhlin A (2016) Convolutional neural networks for sentence classification. GitHub"},{"key":"10713_CR47","unstructured":"P\u00e9rez-Rosas V, Mihalcea R, Morency LP (2013) Utterance-level multimodal sentiment analysis. In: Proceedings of the 51st annual meeting of the association for computational linguistics (Volume 1: Long Papers), pp 973\u2013982"},{"issue":"6","key":"10713_CR48","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MIS.2016.94","volume":"31","author":"A Zadeh","year":"2016","unstructured":"Zadeh A, Zellers R, Pincus E, Morency LP (2016) Multimodal sentiment intensity analysis in videos: facial gestures and verbal messages. IEEE Intell Syst 31(6):82\u201388","journal-title":"IEEE Intell Syst"},{"key":"10713_CR49","doi-asserted-by":"crossref","unstructured":"Nojavanasghari B, Gopinath D, Koushik J, Baltru\u0161aitis T, Morency LP (2016) Deep multimodal fusion for persuasiveness prediction. In: Proceedings of the 18th ACM international conference on multimodal interaction, pp 284\u2013288","DOI":"10.1145\/2993148.2993176"},{"key":"10713_CR50","doi-asserted-by":"crossref","unstructured":"Rajagopalan SS, Morency LP, Baltrusaitis T, Goecke R (2016) Extending long short-term memory for multi-view structured learning. In: European conference on computer vision. Springer, pp 338\u2013353","DOI":"10.1007\/978-3-319-46478-7_21"},{"key":"10713_CR51","doi-asserted-by":"crossref","unstructured":"Blanchard N, Moreira D, Bharati A, Scheirer WJ (2018) Getting the subtext without the text: Scalable multimodal sentiment classification from visual and acoustic modalities. arXiv:1807.01122","DOI":"10.18653\/v1\/W18-3301"}],"container-title":["Neural Processing Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-021-10713-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11063-021-10713-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-021-10713-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,21]],"date-time":"2023-01-21T18:52:01Z","timestamp":1674327121000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11063-021-10713-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1,4]]},"references-count":51,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2022,6]]}},"alternative-id":["10713"],"URL":"https:\/\/doi.org\/10.1007\/s11063-021-10713-5","relation":{},"ISSN":["1370-4621","1573-773X"],"issn-type":[{"value":"1370-4621","type":"print"},{"value":"1573-773X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,1,4]]},"assertion":[{"value":"8 December 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 January 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declaration"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}