{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T16:02:43Z","timestamp":1775836963672,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612528","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"3926-3933","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["Your tone speaks louder than your face! Modality Order Infused Multi-modal Sarcasm Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-1176-005X","authenticated-orcid":false,"given":"Mohit","family":"Tomar","sequence":"first","affiliation":[{"name":"Indian Institute of Technology, Patna, Patna, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3460-1624","authenticated-orcid":false,"given":"Abhisek","family":"Tiwari","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology, Patna, Patna, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3252-0997","authenticated-orcid":false,"given":"Tulika","family":"Saha","sequence":"additional","affiliation":[{"name":"University of Liverpool, Liverpool, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5458-9381","authenticated-orcid":false,"given":"Sriparna","family":"Saha","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology, Patna, Patna, India"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, Vol. 33 (2020), 12449--12460."},{"key":"e_1_3_2_2_2_1","volume-title":"Md Shad Akhtar, and Tanmoy Chakraborty","author":"Bedi Manjot","year":"2021","unstructured":"Manjot Bedi, Shivani Kumar, Md Shad Akhtar, and Tanmoy Chakraborty. 2021. Multi-modal sarcasm detection and humor classification in code-mixed conversations. IEEE Transactions on Affective Computing (2021)."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aimag.v40i4.4813"},{"key":"e_1_3_2_2_4_1","volume-title":"Towards multimodal sarcasm detection (an _obviously_ perfect paper). arXiv preprint arXiv:1906.01815","author":"Castro Santiago","year":"2019","unstructured":"Santiago Castro, Devamanyu Hazarika, Ver\u00f3nica P\u00e9rez-Rosas, Roger Zimmermann, Rada Mihalcea, and Soujanya Poria. 2019. Towards multimodal sarcasm detection (an _obviously_ perfect paper). 
arXiv preprint arXiv:1906.01815 (2019)."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.401"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331254"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/1870568.1870582"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"e_1_3_2_2_9_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297001.3297043"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1874246"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2021.116398"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1162\/coli_a_00336"},{"key":"e_1_3_2_2_14_1","volume-title":"Advantages, Limitations, and Future Directions in Natural Language Processing. arXiv preprint arXiv:2304.02017","author":"Hariri Walid","year":"2023","unstructured":"Walid Hariri. 2023. Unlocking the Potential of ChatGPT: A Comprehensive Exploration of its Applications, Advantages, Limitations, and Future Directions in Natural Language Processing. arXiv preprint arXiv:2304.02017 (2023)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i14.17534"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_17_1","volume-title":"International Conference on Machine Learning. PMLR, 2790--2799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. In International Conference on Machine Learning. PMLR, 2790--2799."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3124420"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P15-2124"},{"key":"e_1_3_2_2_20_1","volume-title":"International Conference on Machine Learning. PMLR, 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International Conference on Machine Learning. PMLR, 5583--5594."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.5555\/1611528.1611529"},{"key":"e_1_3_2_2_22_1","volume-title":"Md Shad Akhtar, and Tanmoy Chakraborty","author":"Kumar Shivani","year":"2022","unstructured":"Shivani Kumar, Atharva Kulkarni, Md Shad Akhtar, and Tanmoy Chakraborty. 2022. When did you become so smart, oh wise one?! Sarcasm Explanation in Multi-modal Multi-party Dialogues. arXiv preprint arXiv:2203.06419 (2022)."},{"key":"e_1_3_2_2_23_1","volume-title":"Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461","author":"Lewis Mike","year":"2019","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer. 2019. 
Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461 (2019)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240605"},{"key":"e_1_3_2_2_25_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.74"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"e_1_3_2_2_28_1","first-page":"14200","article-title":"Attention bottlenecks for multimodal fusion","volume":"34","author":"Nagrani Arsha","year":"2021","unstructured":"Arsha Nagrani, Shan Yang, Anurag Arnab, Aren Jansen, Cordelia Schmid, and Chen Sun. 2021. Attention bottlenecks for multimodal fusion. Advances in Neural Information Processing Systems, Vol. 34 (2021), 14200--14213.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_2_30_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2017.02.003"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"crossref","unstructured":"Rachel Rakov and Andrew Rosenberg. 2013. \"sure i did the right thing\": a system for sarcasm detection in speech.. In Interspeech. 842--846.","DOI":"10.21437\/Interspeech.2013-239"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/s12559-019-09704-5"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.402"},{"key":"e_1_3_2_2_35_1","volume-title":"Sriparna Saha, and Pushpak Bhattacharyya.","author":"Saha Tulika","year":"2022","unstructured":"Tulika Saha, Aditya Prakash Patra, Sriparna Saha, and Pushpak Bhattacharyya. 2022a. Meta-Learning based Deferred Optimisation for Sentiment and Emotion aware Multi-modal Dialogue Act Classification. In Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). Association for Computational Linguistics, Online only, 978--990. https:\/\/aclanthology.org\/2022.aacl-main.71"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.456"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSS.2021.3088714"},{"key":"e_1_3_2_2_38_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. 
Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.aacl-main.76"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-006-7834-8"},{"key":"e_1_3_2_2_41_1","first-page":"200","article-title":"Multimodal few-shot learning with frozen language models","volume":"34","author":"Tsimpoukelli Maria","year":"2021","unstructured":"Maria Tsimpoukelli, Jacob L Menick, Serkan Cabi, SM Eslami, Oriol Vinyals, and Felix Hill. 2021. Multimodal few-shot learning with frozen language models. Advances in Neural Information Processing Systems, Vol. 34 (2021), 200--212.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_42_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.562"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"crossref","unstructured":"Thomas Wolf Lysandre Debut Victor Sanh Julien Chaumond Clement Delangue Anthony Moi Pierric Cistac Tim Rault R\u00e9mi Louf Morgan Funtowicz et al. 2019. Huggingface's transformers: State-of-the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019).","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301387"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612528","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612528","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:57:29Z","timestamp":1755820649000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612528"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":45,"alternative-id":["10.1145\/3581783.3612528","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612528","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
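The JSON above is a Crossref `/works` response for DOI 10.1145/3581783.3612528. As a minimal sketch (not part of the record itself), the snippet below shows how such a record can be retrieved from the public Crossref REST API and how a few of the fields visible above can be read back out; it assumes network access and the usual `{"status": ..., "message": {...}}` envelope that Crossref returns for `/works/{DOI}`.

```python
# Sketch: fetch this Crossref work record and read a few of its fields.
# Assumes the public endpoint https://api.crossref.org/works/{DOI} is reachable.
import json
from urllib.request import urlopen

DOI = "10.1145/3581783.3612528"  # DOI taken from the record above

with urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    work = json.load(resp)["message"]  # "message" holds the work metadata shown above

print(work["title"][0])                        # paper title
print([a["family"] for a in work["author"]])   # author family names
print(work["container-title"][0])              # proceedings title
print(work["is-referenced-by-count"])          # citation count at index time
print(len(work.get("reference", [])))          # should match "references-count" (45)
```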