{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,14]],"date-time":"2026-02-14T10:20:36Z","timestamp":1771064436976,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612846","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"9467-9471","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Cascaded Cross-Modal Transformer for Request and Complaint Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7880-9307","authenticated-orcid":false,"given":"Nicolae-Catalin","family":"Ristea","sequence":"first","affiliation":[{"name":"University Politehnica of Bucharest, Bucharest, Romania"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9301-1950","authenticated-orcid":false,"given":"Radu Tudor","family":"Ionescu","sequence":"additional","affiliation":[{"name":"University of Bucharest, Bucharest, Romania"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"24206","article-title":"VATT: Transformers for multimodal self-supervised learning from raw video, audio and text","volume":"34","author":"Akbari Hassan","year":"2021","unstructured":"Hassan Akbari, Liangzhe Yuan, Rui Qian, Wei-Hong Chuang, Shih-Fu Chang, Yin Cui, and Boqing Gong. 2021. VATT: Transformers for multimodal self-supervised learning from raw video, audio and text. Proceedings of NeurIPS, Vol. 34 (2021), 24206--24221.","journal-title":"Proceedings of NeurIPS"},{"key":"e_1_3_2_1_2_1","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Proceedings of NeurIPS, Vol. 33 (2020), 12449--12460.","journal-title":"Proceedings of NeurIPS"},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of PML4DC (ICLR Workshop).","author":"Ca\u00f1ete Jos\u00e9","year":"2020","unstructured":"Jos\u00e9 Ca\u00f1ete, Gabriel Chaperon, Rodrigo Fuentes, Jou-Hui Ho, Hojin Kang, and Jorge P\u00e9rez. 2020. Spanish Pre-Trained BERT Model and Evaluation Data. In Proceedings of PML4DC (ICLR Workshop)."},{"key":"e_1_3_2_1_4_1","unstructured":"Hyung Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Eric Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma et al. 2022. Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)."},{"key":"e_1_3_2_1_5_1","volume-title":"Multimodal sentiment analysis: A survey of methods, trends and challenges. Comput. Surveys","author":"Das Ringki","year":"2023","unstructured":"Ringki Das and Thoudam Doren Singh. 2023. Multimodal sentiment analysis: A survey of methods, trends and challenges. Comput. Surveys (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Lee Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of ICLR.","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In Proceedings of ICLR."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco_a_01273"},{"key":"e_1_3_2_1_9_1","volume-title":"Mario Lucic, Cordelia Schmid, and Anurag Arnab.","author":"Georgescu Mariana-Iuliana","year":"2022","unstructured":"Mariana-Iuliana Georgescu, Eduardo Fonseca, Radu Tudor Ionescu, Mario Lucic, Cordelia Schmid, and Anurag Arnab. 2022. Audiovisual Masked Autoencoders. arXiv preprint arXiv:2212.05922 (2022)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3545572"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of ICLR.","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. In Proceedings of ICLR."},{"key":"e_1_3_2_1_15_1","volume-title":"Prediction of User Request and Complaint in Spoken Customer-Agent Conversations. arXiv preprint arXiv:2208.10249","author":"Lackovic Nikola","year":"2022","unstructured":"Nikola Lackovic, Claude Montaci\u00e9, Gauthier Lalande, and Marie-Jos\u00e9 Caraty. 2022. Prediction of User Request and Complaint in Spoken Customer-Agent Conversations. arXiv preprint arXiv:2208.10249 (2022)."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of LREC. 2479--2490","author":"Le Hang","year":"2020","unstructured":"Hang Le, Lo\u00efc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoit Crabb\u00e9, Laurent Besacier, and Didier Schwab. 2020. FlauBERT: Unsupervised Language Model Pre-training for French. In Proceedings of LREC. 2479--2490."},{"key":"e_1_3_2_1_17_1","volume-title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.645"},{"key":"e_1_3_2_1_19_1","volume-title":"Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever.","author":"Radford Alec","year":"2022","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2022. Robust speech recognition via large-scale weak supervision. arXiv preprint arXiv:2212.04356 (2022)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2738401"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-249"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Bj\u00f6rn W Schuller Anton Batliner Shahin Amiriparian Alexander Barnhill Maurice Gerczuk Andreas Triantafyllopoulos Alice Baird Panagiotis Tzirakis Chris Gagne Alan S. Cowen et al. 2023. The ACM Multimedia 2023 Computational Paralinguistics Challenge: Emotion Share & Requests. arXiv preprint arXiv:2304.14882 (2023).","DOI":"10.1145\/3581783.3612835"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1093\/bib\/bbab569"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639583"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612846","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612846","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:14:06Z","timestamp":1755821646000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612846"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":24,"alternative-id":["10.1145\/3581783.3612846","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612846","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}