{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T16:35:54Z","timestamp":1776443754689,"version":"3.51.2"},"reference-count":27,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2024,11,15]],"date-time":"2024-11-15T00:00:00Z","timestamp":1731628800000},"content-version":"vor","delay-in-days":319,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Procedia Computer Science"],"published-print":{"date-parts":[[2024]]},"DOI":"10.1016\/j.procs.2024.11.082","type":"journal-article","created":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T12:16:02Z","timestamp":1733487362000},"page":"41-48","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":2,"special_numbering":"C","title":["Multimodal Sentiment Analysis based on Video and Audio Inputs"],"prefix":"10.1016","volume":"251","author":[{"given":"Antonio","family":"Fern\u00e1ndez","sequence":"first","affiliation":[]},{"given":"Suzan","family":"Awinat","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.procs.2024.11.082_bib0001","first-page":"9617","article-title":"TVLT: Textless vision-language transformer","volume":"35","author":"Tang","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.procs.2024.11.082_bib0002","doi-asserted-by":"crossref","first-page":"136843","DOI":"10.1109\/ACCESS.2020.3011977","article-title":"Enhanced video analytics for sentiment analysis based on fusing textual, auditory and visual information","volume":"8","author":"Al-Azani","year":"2020","journal-title":"IEEE Access"},{"issue":"1","key":"10.1016\/j.procs.2024.11.082_bib0003","doi-asserted-by":"crossref","first-page":"659","DOI":"10.1609\/icwsm.v10i1.14810","article-title":"Fusing audio, textual, and visual features for sentiment analysis of news videos","volume":"10","author":"Pereira","year":"2016","journal-title":"Proceedings of the International AAAI Conference on Web and Social Media"},{"key":"10.1016\/j.procs.2024.11.082_bib0004","doi-asserted-by":"crossref","first-page":"204","DOI":"10.1016\/j.inffus.2021.06.003","article-title":"Multimodal video sentiment analysis using deep learning approaches, a survey","volume":"76","author":"Abdu","year":"2021","journal-title":"Information Fusion"},{"key":"10.1016\/j.procs.2024.11.082_bib0005","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.109259","article-title":"TETFN: A text enhanced transformer fusion network for multimodal sentiment analysis","volume":"136","author":"Wang","year":"2023","journal-title":"Pattern Recognition"},{"key":"10.1016\/j.procs.2024.11.082_bib0006","series-title":"Proceedings of the 29th ACM International Conference on Multimedia","first-page":"4400","article-title":"Transformer-based feature reconstruction network for robust multimodal sentiment analysis","author":"Yuan","year":"2021"},{"issue":"4","key":"10.1016\/j.procs.2024.11.082_bib0007","doi-asserted-by":"crossref","first-page":"377","DOI":"10.1109\/TAFFC.2014.2336244","article-title":"CREMA-D: Crowd-sourced Emotional Multimodal Actors Dataset","volume":"5","author":"Cao","year":"2014","journal-title":"IEEE Trans Affect Comput"},{"key":"10.1016\/j.procs.2024.11.082_bib0008","unstructured":"Livingstone, S. R., & Russo, F. A. (2018). The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS)"},{"key":"10.1016\/j.procs.2024.11.082_bib0009","unstructured":"En PLoS ONE (1.0.0, Vol. 13, N\u00famero 5, p. e0196391). Zenodo. https:\/\/doi.org\/10.5281\/zenodo.1188976"},{"key":"10.1016\/j.procs.2024.11.082_bib0010","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.procs.2024.11.082_bib0011","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"6836","article-title":"Vivit: A video vision transformer","author":"Arnab","year":"2021"},{"key":"10.1016\/j.procs.2024.11.082_bib0012","first-page":"32","article-title":"Pytorch: An imperative style, high-performance deep learning library","author":"Paszke","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"10.1016\/j.procs.2024.11.082_bib0013","doi-asserted-by":"crossref","unstructured":"McFee, B., Rafel, C., Liang, D., Ellis, D. P., McVicar, M., Battenberg, E., & Nieto, O. (2015, July). librosa: Audio and music signal analysis in python. In SciPy (pp. 18-24).","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"10.1016\/j.procs.2024.11.082_bib0014","series-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations","first-page":"38","article-title":"Transformers: State-of-the-art natural language processing","author":"Wolf","year":"2020"},{"key":"10.1016\/j.procs.2024.11.082_bib0015","article-title":"Datasets: a community library for natural language processing","author":"Lhoest","year":"2021","journal-title":"arXiv"},{"key":"10.1016\/j.procs.2024.11.082_bib0016","first-page":"2825","article-title":"Scikit-learn: Machine learning in Python","volume":"12","author":"Pedregosa","year":"2011","journal-title":"the Journal of machine Learning research"},{"key":"10.1016\/j.procs.2024.11.082_bib0017","series-title":"12th USENIX symposium on operating systems design and implementation (OSDI 16)","first-page":"265","article-title":"TensorFlow: a system for Large-Scale machine learning","author":"Abadi","year":"2016"},{"key":"10.1016\/j.procs.2024.11.082_bib0018","unstructured":"Boers, M., Lain\u00e9, J., Reid, M., Fauske, V. T., Shambrook, B., Van der Wel, C., & Joy, D. (2017). PyAV Documentation. Available at: https:\/\/pyav.org\/docs\/stable\/index.html"},{"key":"10.1016\/j.procs.2024.11.082_bib0019","article-title":"tqdm: A fast, Extensible Progress Bar for Python and CLI (v4.66.2)","author":"Costa-Luis","year":"2024","journal-title":"Zenodo"},{"key":"10.1016\/j.procs.2024.11.082_bib0020","unstructured":"Zulko, E. (2019). MoviePy: Video editing with Python. Retrieved from https:\/\/zulko.github.io\/moviepy\/"},{"key":"10.1016\/j.procs.2024.11.082_bib0021","doi-asserted-by":"crossref","unstructured":"Hwang, J., Hira, M., Chen, C., Zhang, X., Ni, Z., Sun, G., Ma, P., Huang, R., Pratap, V., Zhang, Y., Kumar, A., Yu, C.-Y., Zhu, C., Liu, C., Kahn, J., Ravanelli, M., Sun, P., Watanabe, S., Shi, Y., Tao, Y., Scheibler, R., Cornell, S., Kim, S., Petridis, S. (2023). TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch. Retrieved from https:\/\/arxiv.org\/abs\/2310.17864","DOI":"10.1109\/ASRU57964.2023.10389648"},{"key":"10.1016\/j.procs.2024.11.082_bib0022","unstructured":"Yang, Y.-Y., Hira, M., Ni, Z., Chourdia, A., Astafurov, A., Chen, C., Yeh, C.-F., Puhrsch, C., Pollack, D., Genzel, D., Greenberg, D., Yang, E. Z., Lian, J., Mahadeokar, J., Hwang, J., Chen, J., Goldsborough, P., Roy, P., Narenthiran, S., Watanabe, S., Chintala, S., Quenneville-B\u00e9lair, V., Shi, Y. (2021). TorchAudio: Building Blocks for Audio and Speech Processing. Retrieved from https:\/\/arxiv.org\/abs\/2110.15018"},{"key":"10.1016\/j.procs.2024.11.082_bib0023","unstructured":"European Parliament. (2024). Artificial Intelligence Act European Parliament legislative resolution of 13 March 2024 on the proposal for a regulation of the European Parliament and of the Council on laying down harmonised rules on Artificial Intelligence (Artificial Intelligence Act) and amending certain Union Legislative Acts (COM(2021)0206 \u2013 C9-0146\/2021 \u20132021\/0106(COD)) Available at: https:\/\/www.europarl.europa.eu\/doceo\/document\/TA-9-2024-0138_EN.pdf"},{"key":"10.1016\/j.procs.2024.11.082_bib0024","unstructured":"Fernandez, A. (2nd of May, 2024). Audio Sentiment Analysis Model Training. Available at: https:\/\/www.kaggle.com\/antoniobfernandez\/audio-sentiment-analysis-model-training"},{"key":"10.1016\/j.procs.2024.11.082_bib0025","unstructured":"Fernandez, A. (2nd of May, 2024). Video Sentiment Analysis Model Training. Available at: https:\/\/www.kaggle.com\/code\/antoniobfernandez\/video-sentiment-analysis-model-training\/notebook"},{"key":"10.1016\/j.procs.2024.11.082_bib0026","unstructured":"Fernandez, A. (2nd of May, 2024). Multimodal Sentiment Analysis Test Framework V1. Available at: https:\/\/www.kaggle.com\/code\/antoniobfernandez\/multimodal-sentiment-analysis-test-framework-v1\/notebook"},{"key":"10.1016\/j.procs.2024.11.082_bib0027","unstructured":"Fernandez, A. (2nd of May, 2024). Multimodal Sentiment Analysis Test Framework V2. Available at: https:\/\/www.kaggle.com\/code\/antoniobfernandez\/multimodal-sentiment-analysis-test-framework-v2\/notebook"}],"container-title":["Procedia Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1877050924033167?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1877050924033167?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,12,12]],"date-time":"2024-12-12T16:21:04Z","timestamp":1734020464000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1877050924033167"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":27,"alternative-id":["S1877050924033167"],"URL":"https:\/\/doi.org\/10.1016\/j.procs.2024.11.082","relation":{},"ISSN":["1877-0509"],"issn-type":[{"value":"1877-0509","type":"print"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Multimodal Sentiment Analysis based on Video and Audio Inputs","name":"articletitle","label":"Article Title"},{"value":"Procedia Computer Science","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.procs.2024.11.082","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 The Author(s). Published by Elsevier B.V.","name":"copyright","label":"Copyright"}]}}