{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T05:54:29Z","timestamp":1743054869820,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":29,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819723898"},{"type":"electronic","value":"9789819723904"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-97-2390-4_29","type":"book-chapter","created":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T18:02:02Z","timestamp":1714240922000},"page":"424-438","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-token Fusion Framework for\u00a0Multimodal Sentiment Analysis"],"prefix":"10.1007","author":[{"given":"Zhihui","family":"Long","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huan","family":"Deng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenguo","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenyin","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,4,28]]},"reference":[{"key":"29_CR1","doi-asserted-by":"crossref","unstructured":"Cheng, M., et al.: Vista: vision and scene text aggregation for cross-modal retrieval. In: CoRR abs\/2203.16778 (2022)","DOI":"10.1109\/CVPR52688.2022.00512"},{"key":"29_CR2","doi-asserted-by":"crossref","unstructured":"Degottex, G., Kane, J., Drugman, T., Raitio, T., Scherer, S.: COVAREP - a collaborative voice analysis repository for speech technologies. In: ICASSP, pp. 960\u2013964 (2014)","DOI":"10.1109\/ICASSP.2014.6853739"},{"key":"29_CR3","doi-asserted-by":"crossref","unstructured":"Deng, H., Kang, P., Yang, Z., Hao, T., Li, Q., Liu, W.: Dense fusion network with multimodal residual for sentiment classification. In: ICME, pp. 1\u20136 (2021)","DOI":"10.1109\/ICME51207.2021.9428321"},{"key":"29_CR4","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL-HLT, pp. 4171\u20134186 (2019)"},{"key":"29_CR5","doi-asserted-by":"crossref","unstructured":"Guo, M., et al.: Attention mechanisms in computer vision: a survey. Comput. Vis. Media 8(3), 331\u2013368 (2022)","DOI":"10.1007\/s41095-022-0271-y"},{"key":"29_CR6","doi-asserted-by":"crossref","unstructured":"Hazarika, D., Zimmermann, R., Poria, S.: MISA: modality-invariant and -specific representations for multimodal sentiment analysis. In: ACM Multimedia, pp. 1122\u20131131 (2020)","DOI":"10.1145\/3394171.3413678"},{"key":"29_CR7","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (2015)"},{"issue":"14","key":"29_CR8","doi-asserted-by":"publisher","first-page":"19415","DOI":"10.1007\/s11042-021-11234-y","volume":"81","author":"H Lai","year":"2022","unstructured":"Lai, H., Yan, X.: Multimodal sentiment analysis with asymmetric window multi-attentions. Multimedia Tools Appl. 81(14), 19415\u201319428 (2022)","journal-title":"Multimedia Tools Appl."},{"key":"29_CR9","unstructured":"Lin, J., Yang, A., Zhang, Y., Liu, J., Zhou, J., Yang, H.: InterBERT: vision-and-language interaction for multi-modal pretraining. CoRR abs\/2003.13198 (2020)"},{"key":"29_CR10","unstructured":"Liu, H., Dai, Z., So, D.R., Le, Q.V.: Pay attention to MLPs. In: NeurIPS, pp. 9204\u20139215 (2021)"},{"key":"29_CR11","doi-asserted-by":"crossref","unstructured":"Liu, Z., Shen, Y., Lakshminarasimhan, V.B., Liang, P.P., Zadeh, A., Morency, L.: Efficient low-rank multimodal fusion with modality-specific factors. In: ACL, pp. 2247\u20132256 (2018)","DOI":"10.18653\/v1\/P18-1209"},{"key":"29_CR12","doi-asserted-by":"crossref","unstructured":"Mai, S., Hu, H., Xing, S.: Modality to modality translation: an adversarial representation learning and graph fusion network for multimodal fusion. In: AAAI, pp. 164\u2013172 (2020)","DOI":"10.1609\/aaai.v34i01.5347"},{"key":"29_CR13","doi-asserted-by":"crossref","unstructured":"Mittal, T., Bhattacharya, U., Chandra, R., Bera, A., Manocha, D.: M3ER: multiplicative multimodal emotion recognition using facial, textual, and speech cues. In: AAAI, pp. 1359\u20131367 (2020)","DOI":"10.1609\/aaai.v34i02.5492"},{"key":"29_CR14","unstructured":"Nagrani, A., Yang, S., Arnab, A., Jansen, A., Schmid, C., Sun, C.: Attention bottlenecks for multimodal fusion. In: NeurIPS, pp. 14200\u201314213 (2021)"},{"key":"29_CR15","doi-asserted-by":"crossref","unstructured":"Pham, H., Liang, P.P., Manzini, T., Morency, L., P\u00f3czos, B.: Found in translation: learning robust joint representations by cyclic translations between modalities. In: AAAI, pp. 6892\u20136899 (2019)","DOI":"10.1609\/aaai.v33i01.33016892"},{"key":"29_CR16","doi-asserted-by":"crossref","unstructured":"Sahay, S., Okur, E., Kumar, S.H., Nachman, L.: Low rank fusion based transformers for multimodal sequences. CoRR abs\/2007.02038 (2020)","DOI":"10.18653\/v1\/2020.challengehml-1.4"},{"key":"29_CR17","doi-asserted-by":"crossref","unstructured":"Tsai, Y.H., Bai, S., Liang, P.P., Kolter, J.Z., Morency, L., Salakhutdinov, R.: Multimodal transformer for unaligned multimodal language sequences. In: ACL, pp. 6558\u20136569 (2019)","DOI":"10.18653\/v1\/P19-1656"},{"key":"29_CR18","unstructured":"Tsai, Y.H., Liang, P.P., Zadeh, A., Morency, L., Salakhutdinov, R.: Learning factorized multimodal representations. In: ICLR (2019)"},{"key":"29_CR19","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS, pp. 5998\u20136008 (2017)"},{"key":"29_CR20","doi-asserted-by":"crossref","unstructured":"Wang, Y., Shen, Y., Liu, Z., Liang, P.P., Zadeh, A., Morency, L.: Words can shift: dynamically adjusting word representations using nonverbal behaviors. In: AAAI, pp. 7216\u20137223 (2019)","DOI":"10.1609\/aaai.v33i01.33017216"},{"issue":"1","key":"29_CR21","doi-asserted-by":"publisher","first-page":"2000688","DOI":"10.1080\/08839514.2021.2000688","volume":"36","author":"X Yan","year":"2022","unstructured":"Yan, X., Xue, H., Jiang, S., Liu, Z.: Multimodal sentiment analysis using multi-tensor fusion network with cross-modal modeling. Appl. Artif. Intell. 36(1), 2000688 (2022)","journal-title":"Appl. Artif. Intell."},{"key":"29_CR22","doi-asserted-by":"crossref","unstructured":"Yu, W., Xu, H., Yuan, Z., Wu, J.: Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis. In: AAAI, pp. 10790\u201310797 (2021)","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"29_CR23","doi-asserted-by":"crossref","unstructured":"Zadeh, A., Chen, M., Poria, S., Cambria, E., Morency, L.: Tensor fusion network for multimodal sentiment analysis. In: EMNLP, pp. 1103\u20131114 (2017)","DOI":"10.18653\/v1\/D17-1115"},{"key":"29_CR24","doi-asserted-by":"crossref","unstructured":"Zadeh, A., Liang, P.P., Mazumder, N., Poria, S., Cambria, E., Morency, L.: Memory fusion network for multi-view sequential learning. In: AAAI, pp. 5634\u20135641 (2018)","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"29_CR25","unstructured":"Zadeh, A., Liang, P.P., Poria, S., Cambria, E., Morency, L.: Multimodal language analysis in the wild: CMU-MOSEI dataset and interpretable dynamic fusion graph. In: ACL, pp. 2236\u20132246 (2018)"},{"key":"29_CR26","doi-asserted-by":"crossref","unstructured":"Zadeh, A., Liang, P.P., Poria, S., Vij, P., Cambria, E., Morency, L.: Multi-attention recurrent network for human communication comprehension. In: AAAI, pp. 5642\u20135649 (2018)","DOI":"10.1609\/aaai.v32i1.12024"},{"issue":"6","key":"29_CR27","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MIS.2016.94","volume":"31","author":"A Zadeh","year":"2016","unstructured":"Zadeh, A., Zellers, R., Pincus, E., Morency, L.: Multimodal sentiment intensity analysis in videos: facial gestures and verbal messages. IEEE Intell. Syst. 31(6), 82\u201388 (2016)","journal-title":"IEEE Intell. Syst."},{"key":"29_CR28","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2020.106639","volume":"212","author":"W Zhang","year":"2021","unstructured":"Zhang, W., Yu, J., Wang, Y., Wang, W.: Multimodal deep fusion for image question answering. Knowl. Based Syst. 212, 106639 (2021)","journal-title":"Knowl. Based Syst."},{"key":"29_CR29","doi-asserted-by":"crossref","unstructured":"Zhao, S., et al.: An end-to-end visual-audio attention network for emotion recognition in user-generated videos. In: AAAI, pp. 303\u2013311 (2020)","DOI":"10.1609\/aaai.v34i01.5364"}],"container-title":["Lecture Notes in Computer Science","Web and Big Data"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-2390-4_29","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T18:19:03Z","timestamp":1714241943000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-2390-4_29"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9789819723898","9789819723904"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-2390-4_29","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"28 April 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"APWeb-WAIM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asia-Pacific Web (APWeb) and Web-Age Information Management (WAIM) Joint International Conference on Web and Big Data","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 October 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 October 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"apwebwaim2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.apweb-waim2023.com\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}