{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,26]],"date-time":"2026-06-26T20:37:26Z","timestamp":1782506246249,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681599","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"4795-4804","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["DQ-Former: Querying Transformer with Dynamic Modality Priority for Cognitive-aligned Multimodal Emotion Recognition in Conversation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-7715-8630","authenticated-orcid":false,"given":"Ye","family":"Jing","sequence":"first","affiliation":[{"name":"the State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation, Chinese Academy of Science &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6597-7582","authenticated-orcid":false,"given":"Xinpei","family":"Zhao","sequence":"additional","affiliation":[{"name":"the State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation, Chinese Academy of Science &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. In Advances in Neural Information Processing Systems, Vol. 33. Curran Associates, Inc., 12449--12460. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/92d1e1eb1cd6f9fba3227870bb6d7f07-Paper.pdf"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/S10579-008--9076--6"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_5_1","volume-title":"Assessing BERT's Syntactic Abilities. arXiv preprint","author":"Goldberg Yoav","year":"2019","unstructured":"Yoav Goldberg. 2019. Assessing BERT's Syntactic Abilities. arXiv preprint, Vol. abs\/1901.05287 (2019). showeprint[arXiv]1901.05287 http:\/\/arxiv.org\/abs\/1901.05287"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.3466853"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548137"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3356185"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747397"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.440"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1356"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1057\/s41599-020-0499-z"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.996"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202). PMLR, 19730--19742. https:\/\/proceedings.mlr.press\/v202\/li23q.html"},{"key":"e_1_3_2_1_16_1","volume-title":"GA2MIF: Graph and Attention Based Two-Stage Multi-Source Information Fusion for Conversational Emotion Detection","author":"Li Jiang","year":"2023","unstructured":"Jiang Li, Xiaoping Wang, Guoqing Lv, and Zhigang Zeng. 2023. GA2MIF: Graph and Attention Based Two-Stage Multi-Source Information Fusion for Conversational Emotion Detection. IEEE Transactions on Affective Computing (2023). https:\/\/arxiv.org\/pdf\/2207.11900.pdf"},{"key":"e_1_3_2_1_17_1","volume-title":"VisualBERT: A Simple and Performant Baseline for Vision and Language. arXiv preprint","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. VisualBERT: A Simple and Performant Baseline for Vision and Language. arXiv preprint, Vol. abs\/1908.03557 (2019). showeprint[arXiv]1908.03557 http:\/\/arxiv.org\/abs\/1908.03557"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023428"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.126"},{"key":"e_1_3_2_1_20_1","volume-title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint, Vol. abs\/1907.11692 (2019). showeprint[arXiv]1907.11692 http:\/\/arxiv.org\/abs\/1907.11692"},{"key":"e_1_3_2_1_21_1","volume-title":"Decoupled weight decay regularization. arXiv preprint","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint (2017). arxiv: 1711.05101 [cs.LG] http:\/\/arxiv.org\/abs\/1711.05101"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00258"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/AAAI.V33I01.33016818"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.229"},{"key":"e_1_3_2_1_25_1","volume-title":"Advances in Neural Information Processing Systems","volume":"34","author":"Nagrani Arsha","year":"2021","unstructured":"Arsha Nagrani, Shan Yang, Anurag Arnab, Aren Jansen, Cordelia Schmid, and Chen Sun. 2021. Attention Bottlenecks for Multimodal Fusion. In Advances in Neural Information Processing Systems, Vol. 34. Curran Associates, Inc., 14200--14213. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2021\/file\/76ba9f564ebbc35b1014ac498fafadd0-Paper.pdf"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1050"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.824"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.39"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6431"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1656"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Proceedings of the 31st International Conference on Neural Information Processing Systems (Long Beach, California, USA), Vol. 30. Curran Associates, Inc., 6000--6010. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_33_1","volume-title":"Multimodal transformer with adaptive modality weighting for multimodal sentiment analysis. Neurocomputing","author":"Wang Yifeng","year":"2023","unstructured":"Yifeng Wang, Jiahao He, Di Wang, Quan Wang, Bo Wan, and Xuemei Luo. 2023. Multimodal transformer with adaptive modality weighting for multimodal sentiment analysis. Neurocomputing (2023), 127--181. https:\/\/www.sciencedirect.com\/science\/article\/abs\/pii\/S0925231223013048"},{"key":"e_1_3_2_1_34_1","volume-title":"Louis-Philippe Morency, Peter Bell, and Catherine Lai.","author":"Wang Yaoting","year":"2023","unstructured":"Yaoting Wang, Yuanchao Li, Paul Pu Liang, Louis-Philippe Morency, Peter Bell, and Catherine Lai. 2023. Cross-Attention is Not Enough: Incongruity-Aware Dynamic Hierarchical Fusion for Multimodal Affect Recognition. arxiv: 2305.13583 [cs.CL] https:\/\/arxiv.org\/abs\/2305.13583"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3551876.3554813"},{"key":"e_1_3_2_1_36_1","unstructured":"Xiao Xu Chenfei Wu Shachar Rosenman Vasudev Lal Wanxiang Che and Nan Duan. 2024. BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning. arxiv: 2206.08657 [cs.CV]"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1115"},{"key":"e_1_3_2_1_38_1","volume-title":"Navonil Mazumder, Soujanya Poria, Erik Cambria, and Louis-Philippe Morency.","author":"Zadeh Amir","year":"2018","unstructured":"Amir Zadeh, Paul Pu Liang, Navonil Mazumder, Soujanya Poria, Erik Cambria, and Louis-Philippe Morency. 2018. Memory fusion network for multi-view sequential learning. In Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence and Thirtieth Innovative Applications of Artificial Intelligence Conference and Eighth AAAI Symposium on Educational Advances in Artificial Intelligence (New Orleans, Louisiana, USA). AAAI Press, Article 691, 8 pages."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.49"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.732"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.861"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681599","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681599","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:49Z","timestamp":1750295869000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681599"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":41,"alternative-id":["10.1145\/3664647.3681599","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681599","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}