{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T02:10:09Z","timestamp":1755915009737,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Malaysia Ministry of Higher Education (MOHE)","award":["FRGS\/1\/2022\/ICT02\/HWUM\/02\/1"],"award-info":[{"award-number":["FRGS\/1\/2022\/ICT02\/HWUM\/02\/1"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3689092.3689409","type":"proceedings-article","created":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T18:33:17Z","timestamp":1729708397000},"page":"15-23","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Are You Paying Attention? Multimodal Linear Attention Transformers for Affect Prediction in Video Conversations"],"prefix":"10.1145","author":[{"given":"Jia Qing","family":"Poh","sequence":"first","affiliation":[{"name":"School of Mathematical and Computer Sciences, Heriot-Watt University Malaysia, Putrajaya, Malaysia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"John","family":"See","sequence":"additional","affiliation":[{"name":"School of Mathematical and Computer Sciences, Heriot-Watt University Malaysia, Putrajaya, Malaysia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1467-1115","authenticated-orcid":false,"given":"Neamat","family":"El Gayar","sequence":"additional","affiliation":[{"name":"School of Mathematical and Computer Sciences, Heriot-Watt University Dubai, Dubai, United Arab Emirates"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4517-0391","authenticated-orcid":false,"given":"Lai-Kuan","family":"Wong","sequence":"additional","affiliation":[{"name":"Faculty of Computing and Informatics, Multimedia University, Cyberjaya, Malaysia"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3607865.3613179"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1111\/1467-8721.00003"},{"key":"e_1_3_2_1_4_1","volume-title":"IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation","author":"Busso Carlos","year":"2008","unstructured":"Carlos Busso, Murtaza Bulut, Chi-Chun Lee, Abe Kazemzadeh, Emily Mower, Samuel Kim, Jeannette N Chang, Sungbok Lee, and Shrikanth S Narayanan. 2008. IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation, Vol. 42 (2008), 335--359."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/5.664274"},{"key":"e_1_3_2_1_6_1","unstructured":"Krzysztof Choromanski Valerii Likhosherstov David Dohan Xingyou Song Andreea Gane Tamas Sarlos Peter Hawkins Jared Davis Afroz Mohiuddin Lukasz Kaiser et al. 2020. Rethinking attention with performers. arXiv preprint arXiv:2009.14794 (2020)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2011.6130508"},{"key":"e_1_3_2_1_8_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.3390\/s20030592"},{"key":"e_1_3_2_1_10_1","volume-title":"Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778","author":"Gong Yuan","year":"2021","unstructured":"Yuan Gong, Yu-An Chung, and James Glass. 2021. Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778 (2021)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3120633"},{"key":"e_1_3_2_1_12_1","first-page":"5036","article-title":"Conformer","volume":"2020","author":"Gulati Anmol","year":"2020","unstructured":"Anmol Gulati, James Qin, Chung-Cheng Chiu, Niki Parmar, Yu Zhang, Jiahui Yu, Wei Han, Shibo Wang, Zhengdong Zhang, Yonghui Wu, and Ruoming Pang. 2020. Conformer: Convolution-augmented Transformer for Speech Recognition. In Proc. Interspeech 2020. 5036--5040.","journal-title":"Convolution-augmented Transformer for Speech Recognition. In Proc. Interspeech"},{"key":"e_1_3_2_1_13_1","volume-title":"Vision Xformers: Efficient attention for image classification. arXiv preprint arXiv:2107.02239","author":"Jeevan Pranav","year":"2021","unstructured":"Pranav Jeevan and Amit Sethi. 2021. Vision Xformers: Efficient attention for image classification. arXiv preprint arXiv:2107.02239 (2021)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413620"},{"key":"e_1_3_2_1_15_1","volume-title":"Aff-wild2: Extending the aff-wild database for affect recognition. arXiv preprint arXiv:1811.07770","author":"Kollias Dimitrios","year":"2018","unstructured":"Dimitrios Kollias and Stefanos Zafeiriou. 2018. Aff-wild2: Extending the aff-wild database for affect recognition. arXiv preprint arXiv:1811.07770 (2018)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3014737"},{"key":"e_1_3_2_1_18_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3049898"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1209"},{"key":"e_1_3_2_1_21_1","volume-title":"Proc. Workshop Detection Classification Acoust. Scenes Events (DCASE). 100--104","author":"Miyazaki Koichi","year":"2020","unstructured":"Koichi Miyazaki, Tatsuya Komatsu, Tomoki Hayashi, Shinji Watanabe, Tomoki Toda, and Kazuya Takeda. 2020. Convolution-augmented transformer for semi-supervised sound event detection. In Proc. Workshop Detection Classification Acoust. Scenes Events (DCASE). 100--104."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-020-00630-y"},{"volume-title":"Theories of emotion","author":"Plutchik Robert","key":"e_1_3_2_1_24_1","unstructured":"Robert Plutchik and Henry Kellerman. 2013. Theories of emotion. Vol. 1. Academic Press."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25078"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_2_1_27_1","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in Neural Information Processing Systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_28_1","volume-title":"Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768","author":"Wang Sinong","year":"2020","unstructured":"Sinong Wang, Belinda Z Li, Madian Khabsa, Han Fang, and Hao Ma. 2020. Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768 (2020)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.5555\/2449288.2449295"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.144"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17664"},{"key":"e_1_3_2_1_32_1","volume-title":"Multimodal learning with transformers: A survey","author":"Xu Peng","year":"2023","unstructured":"Peng Xu, Xiatian Zhu, and David A Clifton. 2023. Multimodal learning with transformers: A survey. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512527.3531385"},{"key":"e_1_3_2_1_34_1","volume-title":"An evaluation of statistical approaches to text categorization. Information retrieval","author":"Yang Yiming","year":"1999","unstructured":"Yiming Yang. 1999. An evaluation of statistical approaches to text categorization. Information retrieval, Vol. 1, 1 (1999), 69--90."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2023.3282704"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 2nd International Workshop on Multimodal and Responsible Affective Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689092.3689409","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689092.3689409","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T01:59:03Z","timestamp":1755914343000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689092.3689409"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":35,"alternative-id":["10.1145\/3689092.3689409","10.1145\/3689092"],"URL":"https:\/\/doi.org\/10.1145\/3689092.3689409","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}