{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T15:36:17Z","timestamp":1768318577267,"version":"3.49.0"},"publisher-location":"Cham","reference-count":144,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726484","type":"print"},{"value":"9783031726491","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72649-1_10","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"165-186","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Facial Affective Behavior Analysis with\u00a0Instruction Tuning"],"prefix":"10.1007","author":[{"given":"Yifan","family":"Li","sequence":"first","affiliation":[]},{"given":"Anh","family":"Dao","sequence":"additional","affiliation":[]},{"given":"Wentao","family":"Bao","sequence":"additional","affiliation":[]},{"given":"Zhen","family":"Tan","sequence":"additional","affiliation":[]},{"given":"Tianlong","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Huan","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Kong","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"10_CR1","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems (2022)"},{"key":"10_CR2","unstructured":"Bai, J., et\u00a0al.: Qwen technical report (2023). arXiv preprint arXiv:2309.16609"},{"key":"10_CR3","unstructured":"Bai, J., et al.: Qwen-vl: A frontier large vision-language model with versatile abilities (2023). arXiv preprint arXiv:2308.12966"},{"key":"10_CR4","doi-asserted-by":"publisher","first-page":"373","DOI":"10.1146\/annurev.psych.58.110405.085709","volume":"58","author":"LF Barrett","year":"2007","unstructured":"Barrett, L.F., Mesquita, B., Ochsner, K.N., Gross, J.J.: The experience of emotion. Annu. Rev. Psychol. 58, 373\u2013403 (2007)","journal-title":"Annu. Rev. Psychol."},{"key":"10_CR5","doi-asserted-by":"crossref","unstructured":"Chang, Y., Wang, S.: Knowledge-driven self-supervised representation learning for facial action unit recognition. In: CVPR, pp. 20417\u201320426 (2022)","DOI":"10.1109\/CVPR52688.2022.01977"},{"key":"10_CR6","unstructured":"Chen, F., et al.: X-LLM: Bootstrapping advanced large language models by treating multi-modalities as foreign languages (2023). arXiv preprint arXiv:2305.04160"},{"key":"10_CR7","doi-asserted-by":"crossref","unstructured":"Chen, F., Shao, J., Zhu, S., Shen, H.T.: Multivariate, multi-frequency and multimodal: rethinking graph neural networks for emotion recognition in conversation. In: CVPR, pp. 
10761\u201310770 (2023)","DOI":"10.1109\/CVPR52729.2023.01036"},{"key":"10_CR8","unstructured":"Chen, J., Zhang, A., Shi, X., Li, M., Smola, A., Yang, D.: Parameter-efficient fine-tuning design spaces (2023). arXiv preprint arXiv:2301.01821"},{"key":"10_CR9","unstructured":"Chen, J., et al.: MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning (2023). arXiv preprint arXiv:2310.09478"},{"key":"10_CR10","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: Unleashing multimodal LLM\u2019s referential dialogue magic (2023). arXiv preprint arXiv:2306.15195"},{"key":"10_CR11","unstructured":"Chen, R., Zhang, H., Liang, S., Li, J., Cao, X.: Less is more: Fewer interpretable region via submodular subset selection. In: ICLR (2024)"},{"key":"10_CR12","unstructured":"Chen, T., Saxena, S., Li, L., Fleet, D.J., Hinton, G.: Pix2seq: a language modeling framework for object detection. In: ICLR (2021)"},{"key":"10_CR13","unstructured":"Chen, Z., et\u00a0al.: InternVL: Scaling up vision foundation models and aligning for generic visual-linguistic tasks (2023). arXiv preprint arXiv:2312.14238"},{"key":"10_CR14","unstructured":"Chiang, W.L., et al.: Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"10_CR15","unstructured":"Cho, J., Lei, J., Tan, H., Bansal, M.: Unifying vision-and-language tasks via text generation. In: ICML, pp. 1931\u20131942 (2021)"},{"key":"10_CR16","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models (2022). arXiv preprint arXiv:2210.11416"},{"key":"10_CR17","doi-asserted-by":"crossref","unstructured":"Corneanu, C., Madadi, M., Escalera, S.: Deep structure inference network for facial action unit recognition. In: ECCV, pp. 298\u2013313 (2018)","DOI":"10.1007\/978-3-030-01258-8_19"},{"key":"10_CR18","doi-asserted-by":"crossref","unstructured":"Cui, Z., Kuang, C., Gao, T., Talamadupula, K., Ji, Q.: Biomechanics-guided facial action unit detection through force modeling. In: CVPR, pp. 8694\u20138703 (2023)","DOI":"10.1109\/CVPR52729.2023.00840"},{"key":"10_CR19","unstructured":"Dai, W., et al.: Instructblip: Towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"10_CR20","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: ArcFace: additive angular margin loss for deep face recognition. In: CVPR, pp. 4690\u20134699 (2019)","DOI":"10.1109\/CVPR.2019.00482"},{"key":"10_CR21","unstructured":"Dettmers, T., Pagnoni, A., Holtzman, A., Zettlemoyer, L.: QLoRA: efficient finetuning of quantized LLMs. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"10_CR22","doi-asserted-by":"crossref","unstructured":"Dhall, A., Goecke, R., Lucey, S., Gedeon, T.: Static facial expression analysis in tough conditions: Data, evaluation protocol and benchmark. In: IEEE International Conference on Computer Vision Workshops, pp. 2106\u20132112 (2011)","DOI":"10.1109\/ICCVW.2011.6130508"},{"key":"10_CR23","unstructured":"Dong, R., et\u00a0al.: DreamLLM: Synergistic multimodal comprehension and creation (2023). arXiv preprint arXiv:2309.11499"},{"key":"10_CR24","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. In: ICLR (2020)"},{"key":"10_CR25","doi-asserted-by":"crossref","unstructured":"Ekman, P., Friesen, W.V.: Facial action coding system. 
Environmental Psychology & Nonverbal Behavior (1978)","DOI":"10.1037\/t27734-000"},{"key":"10_CR26","doi-asserted-by":"crossref","unstructured":"Ekman, P., et al.: Basic emotions. In: Handbook of Cognition and Emotion, pp. 45\u201360 (1999)","DOI":"10.1002\/0470013494.ch3"},{"key":"10_CR27","doi-asserted-by":"crossref","unstructured":"Fabian Benitez-Quiroz, C., Srinivasan, R., Martinez, A.M.: EmotioNet: an accurate, real-time algorithm for the automatic annotation of a million facial expressions in the wild. In: CVPR, pp. 5562\u20135570 (2016)","DOI":"10.1109\/CVPR.2016.600"},{"key":"10_CR28","first-page":"65","volume":"61","author":"A Gatt","year":"2018","unstructured":"Gatt, A., Krahmer, E.: Survey of the state of the art in natural language generation: core tasks, applications and evaluation. Jour. Art. Intel. Resea. 61, 65\u2013170 (2018)","journal-title":"Jour. Art. Intel. Resea."},{"key":"10_CR29","doi-asserted-by":"crossref","unstructured":"Girard, J.M., Chu, W.S., Jeni, L.A., Cohn, J.F.: Sayette group formation task (GFT) spontaneous facial expression database. In: IEEE FG, pp. 581\u2013588 (2017)","DOI":"10.1109\/FG.2017.144"},{"key":"10_CR30","doi-asserted-by":"crossref","unstructured":"Goodfellow, I.J., et\u00a0al.: Challenges in representation learning: a report on three machine learning contests. In: Advances in Neural Information Processing Systems, pp. 117\u2013124 (2013)","DOI":"10.1007\/978-3-642-42051-1_16"},{"issue":"2","key":"10_CR31","doi-asserted-by":"publisher","first-page":"484","DOI":"10.1016\/j.concog.2008.03.019","volume":"17","author":"D Grandjean","year":"2008","unstructured":"Grandjean, D., Sander, D., Scherer, K.R.: Conscious emotional experience emerges as a function of multilevel, appraisal-driven response synchronization. Conscious. Cogn. 17(2), 484\u2013495 (2008)","journal-title":"Conscious. Cogn."},{"key":"10_CR32","doi-asserted-by":"crossref","unstructured":"Guo, D., Rush, A.M., Kim, Y.: Parameter-efficient transfer learning with diff pruning (2020). arXiv preprint arXiv:2012.07463","DOI":"10.18653\/v1\/2021.acl-long.378"},{"issue":"3","key":"10_CR33","doi-asserted-by":"publisher","first-page":"225","DOI":"10.1080\/026999399379267","volume":"13","author":"J Haidt","year":"1999","unstructured":"Haidt, J., Keltner, D.: Culture and facial expression: open-ended methods find more expressions and a gradient of recognition. Cogn. Emot. 13(3), 225\u2013266 (1999)","journal-title":"Cogn. Emot."},{"key":"10_CR34","unstructured":"He, J., Zhou, C., Ma, X., Berg-Kirkpatrick, T., Neubig, G.: Towards a unified view of parameter-efficient transfer learning (2021). arXiv preprint arXiv:2110.04366"},{"key":"10_CR35","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: CVPR, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"10_CR36","doi-asserted-by":"crossref","unstructured":"He, S., Ding, L., Dong, D., Zhang, M., Tao, D.: SparseAdapter: An easy approach for improving the parameter-efficiency of adapters (2022). arXiv preprint arXiv:2210.04284","DOI":"10.18653\/v1\/2022.findings-emnlp.160"},{"key":"10_CR37","unstructured":"Houlsby, N., et al.: Parameter-efficient transfer learning for NLP. In: ICML, pp. 2790\u20132799 (2019)"},{"key":"10_CR38","unstructured":"Hu, E.J., et al.: LoRA: Low-rank adaptation of large language models (2021). 
arXiv preprint arXiv:2106.09685"},{"key":"10_CR39","doi-asserted-by":"publisher","unstructured":"Izard, C.E.: Human emotions. Springer Science & Business Media (2013). https:\/\/doi.org\/10.1007\/978-1-4899-2209-0","DOI":"10.1007\/978-1-4899-2209-0"},{"key":"10_CR40","unstructured":"Jacob, G.M., Stenger, B.: Facial action unit detection with transformers. In: CVPR, pp. 7680\u20137689 (2021)"},{"key":"10_CR41","unstructured":"Jiang, A.Q., et\u00a0al.: Mistral 7B (2023). arXiv preprint arXiv:2310.06825"},{"key":"10_CR42","doi-asserted-by":"crossref","unstructured":"Jiang, X., et al.: DFEW: a large-scale database for recognizing dynamic facial expressions in the wild. In: ACM MM, pp. 2881\u20132889 (2020)","DOI":"10.1145\/3394171.3413620"},{"key":"10_CR43","first-page":"1022","volume":"34","author":"R Karimi Mahabadi","year":"2021","unstructured":"Karimi Mahabadi, R., Henderson, J., Ruder, S.: Compacter: efficient low-rank hypercomplex adapter layers. Adv. Neural Inform. Process. Syst. 34, 1022\u20131035 (2021)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"10_CR44","doi-asserted-by":"crossref","unstructured":"Kollias, D., Schulc, A., Hajiyev, E., Zafeiriou, S.: Analysing affective behavior in the first ABAW 2020 competition. In: IEEE FG, pp. 637\u2013643 (2020)","DOI":"10.1109\/FG47880.2020.00126"},{"key":"10_CR45","unstructured":"Kollias, D., Zafeiriou, S.: Aff-Wild2: Extending the Aff-Wild database for affect recognition (2018). arXiv preprint arXiv:1811.07770"},{"key":"10_CR46","doi-asserted-by":"crossref","unstructured":"Lai, X., et al.: LISA: Reasoning segmentation via large language model (2023). arXiv preprint arXiv:2308.00692","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"10_CR47","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning (2021). arXiv preprint arXiv:2104.08691","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"10_CR48","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: A multi-modal model with in-context instruction tuning (2023). arXiv preprint arXiv:2305.03726"},{"key":"10_CR49","doi-asserted-by":"crossref","unstructured":"Li, G., Zhu, X., Zeng, Y., Wang, Q., Lin, L.: Semantic relationships guided representation learning for facial action unit recognition. In: AAAI, pp. 8594\u20138601 (2019)","DOI":"10.1609\/aaai.v33i01.33018594"},{"key":"10_CR50","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models (2023). arXiv preprint arXiv:2301.12597"},{"key":"10_CR51","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML, pp. 12888\u201312900 (2022)"},{"issue":"1","key":"10_CR52","first-page":"356","volume":"28","author":"S Li","year":"2019","unstructured":"Li, S., Deng, W.: Reliable crowdsourcing and deep locality-preserving learning for unconstrained facial expression recognition. IEEE TIP 28(1), 356\u2013370 (2019)","journal-title":"IEEE TIP"},{"issue":"3","key":"10_CR53","doi-asserted-by":"publisher","first-page":"1195","DOI":"10.1109\/TAFFC.2020.2981446","volume":"13","author":"S Li","year":"2020","unstructured":"Li, S., Deng, W.: Deep facial expression recognition: a survey. IEEE Trans. Affect. Comput. 13(3), 1195\u20131215 (2020)","journal-title":"IEEE Trans. Affect. 
Comput."},{"key":"10_CR54","doi-asserted-by":"crossref","unstructured":"Li, S., Deng, W., Du, J.: Reliable crowdsourcing and deep locality-preserving learning for expression recognition in the wild. In: CVPR, pp. 2584\u20132593 (2017)","DOI":"10.1109\/CVPR.2017.277"},{"issue":"11","key":"10_CR55","doi-asserted-by":"publisher","first-page":"2583","DOI":"10.1109\/TPAMI.2018.2791608","volume":"40","author":"W Li","year":"2018","unstructured":"Li, W., Abtahi, F., Zhu, Z., Yin, L.: EAC-Net: deep nets with enhancing and cropping for facial action unit detection. IEEE TPAMI 40(11), 2583\u20132596 (2018)","journal-title":"IEEE TPAMI"},{"key":"10_CR56","doi-asserted-by":"crossref","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: Optimizing continuous prompts for generation (2021). arXiv preprint arXiv:2101.00190","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"10_CR57","unstructured":"Li, X., Behpour, S., Doan, T.L., He, W., Gou, L., Ren, L.: UP-DP: unsupervised prompt learning for data pre-selection with vision-language models. In: Advances in Neural Information Processing Systems, vol.\u00a036 (2024)"},{"key":"10_CR58","doi-asserted-by":"crossref","unstructured":"Li, X., Pan, D., Li, C., Qiang, Y., Zhu, D.: Negative flux aggregation to estimate feature attributions. In: IJCAI (2023)","DOI":"10.24963\/ijcai.2023\/50"},{"key":"10_CR59","unstructured":"Li, Y., Han, H., Shan, S., Ji, Z., Bai, J., Chen, X.: ReCoT: regularized co-training for facial action unit recognition with noisy labels. In: BMVC (2023)"},{"key":"10_CR60","doi-asserted-by":"crossref","unstructured":"Li, Y., Sun, H., Liu, Z., Han, H., Shan, S.: Affective behaviour analysis using pretrained model with facial prior. In: European Conference on Computer Vision Workshops, pp. 19\u201330 (2022)","DOI":"10.1007\/978-3-031-25075-0_2"},{"key":"10_CR61","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, Y., Cui, Z.: Decoupled multimodal distilling for emotion recognition. In: CVPR, pp. 6631\u20136640 (2023)","DOI":"10.1109\/CVPR52729.2023.00641"},{"issue":"5","key":"10_CR62","first-page":"2439","volume":"28","author":"Y Li","year":"2018","unstructured":"Li, Y., Zeng, J., Shan, S., Chen, X.: Occlusion aware facial expression recognition using CNN with attention mechanism. IEEE TIP 28(5), 2439\u20132450 (2018)","journal-title":"IEEE TIP"},{"key":"10_CR63","doi-asserted-by":"crossref","unstructured":"Li, Y., Zeng, J., Shan, S., Chen, X.: Self-supervised representation learning from videos for facial action unit detection. In: CVPR, pp. 10924\u201310933 (2019)","DOI":"10.1109\/CVPR.2019.01118"},{"key":"10_CR64","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text summarization branches out, pp. 74\u201381 (2004)"},{"key":"10_CR65","unstructured":"Lin, J., et al.: VILA: On pre-training for visual language models (2023). arXiv preprint arXiv:2312.07533"},{"key":"10_CR66","doi-asserted-by":"crossref","unstructured":"Lin, Z., Madotto, A., Fung, P.: Exploring versatile generative language model via parameter-efficient transfer learning (2020). arXiv preprint arXiv:2004.03829","DOI":"10.18653\/v1\/2020.findings-emnlp.41"},{"key":"10_CR67","first-page":"1950","volume":"35","author":"H Liu","year":"2022","unstructured":"Liu, H., et al.: Few-shot parameter-efficient fine-tuning is better and cheaper than in-context learning. Adv. Neural Inform. Process. Syst. 35, 1950\u20131965 (2022)","journal-title":"Adv. Neural Inform. Process. 
Syst."},{"key":"10_CR68","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning (2023). arXiv preprint arXiv:2310.03744","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"10_CR69","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"10_CR70","doi-asserted-by":"crossref","unstructured":"Liu, M., Shan, S., Wang, R., Chen, X.: Learning expressionlets on spatio-temporal manifold for dynamic facial expression recognition. In: CVPR, pp. 1749\u20131756 (2014)","DOI":"10.1109\/CVPR.2014.226"},{"key":"10_CR71","unstructured":"Liu, Q., et al.: MoELoRA: An MOE-based parameter efficient fine-tuning method for multi-task medical applications (2023). arXiv preprint arXiv:2310.18339"},{"key":"10_CR72","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: MAFW: a large-scale, multi-modal, compound affective database for dynamic facial expression recognition in the wild. In: ACM MM, pp. 24\u201332 (2022)","DOI":"10.1145\/3503161.3548190"},{"key":"10_CR73","doi-asserted-by":"crossref","unstructured":"Lopes, A.T., De\u00a0Aguiar, E., De\u00a0Souza, A.F., Oliveira-Santos, T.: Facial expression recognition with convolutional neural networks: coping with few data and the training sample order. PR 61, 610\u2013628 (2017)","DOI":"10.1016\/j.patcog.2016.07.026"},{"key":"10_CR74","doi-asserted-by":"crossref","unstructured":"Lu, H., et\u00a0al.: GPT as psychologist? Preliminary evaluations for GPT-4V on visual affective computing (2024). arXiv preprint arXiv:2403.05916","DOI":"10.1109\/CVPRW63382.2024.00037"},{"key":"10_CR75","unstructured":"Lu, L., Tavabi, L., Soleymani, M.: Self-supervised learning for facial action unit recognition through temporal consistency. In: BMVC (2020)"},{"key":"10_CR76","doi-asserted-by":"crossref","unstructured":"Lucey, P., Cohn, J.F., Kanade, T., Saragih, J., Ambadar, Z., Matthews, I.: The extended Cohn-Kanade dataset (CK+): a complete dataset for action unit and emotion-specified expression. In: CVPRW, pp. 94\u2013101 (2010)","DOI":"10.1109\/CVPRW.2010.5543262"},{"key":"10_CR77","doi-asserted-by":"crossref","unstructured":"Luo, C., Song, S., Xie, W., Shen, L., Gunes, H.: Learning multi-dimensional edge feature-based au relation graph for facial action unit recognition (2022). arXiv preprint arXiv:2205.01782","DOI":"10.24963\/ijcai.2022\/173"},{"key":"10_CR78","unstructured":"Mahabadi, R.K., Ruder, S., Dehghani, M., Henderson, J.: Parameter-efficient multi-task fine-tuning for transformers via shared hypernetworks (2021). arXiv preprint arXiv:2106.04489"},{"key":"10_CR79","unstructured":"Mao, J., Xu, R., Yin, X., Chang, Y., Nie, B., Huang, A.: Poster V2: A simpler and stronger facial expression recognition network (2023). arXiv preprint arXiv:2301.12149"},{"key":"10_CR80","doi-asserted-by":"crossref","unstructured":"Mao, Y., et al.: UniPELT: A unified framework for parameter-efficient language model tuning (2021). arXiv preprint arXiv:2110.07577","DOI":"10.18653\/v1\/2022.acl-long.433"},{"issue":"3","key":"10_CR81","doi-asserted-by":"publisher","first-page":"325","DOI":"10.1109\/TAFFC.2017.2731763","volume":"10","author":"B Martinez","year":"2017","unstructured":"Martinez, B., Valstar, M.F., Jiang, B., Pantic, M.: Automatic analysis of facial actions: a survey. IEEE Trans. Affect. Comput. 10(3), 325\u2013347 (2017)","journal-title":"IEEE Trans. Affect. 
Comput."},{"issue":"2","key":"10_CR82","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1109\/T-AFFC.2013.4","volume":"4","author":"SM Mavadati","year":"2013","unstructured":"Mavadati, S.M., Mahoor, M.H., Bartlett, K., Trinh, P., Cohn, J.F.: DISFA: a spontaneous facial action intensity database. IEEE Trans. Affect. Comput. 4(2), 151\u2013160 (2013)","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"1","key":"10_CR83","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1109\/TAFFC.2017.2740923","volume":"10","author":"A Mollahosseini","year":"2017","unstructured":"Mollahosseini, A., Hasani, B., Mahoor, M.H.: AffectNet: a database for facial expression, valence, and arousal computing in the wild. IEEE Trans. Affect. Comput. 10(1), 18\u201331 (2017)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10_CR84","unstructured":"Niu, X., Han, H., Shan, S., Chen, X.: Multi-label co-regularization for semi-supervised facial action unit recognition. In: Advances in Neural Information Processing Systems, pp. 909\u2013919 (2019)"},{"key":"10_CR85","doi-asserted-by":"crossref","unstructured":"Niu, X., Han, H., Yang, S., Huang, Y., Shan, S.: Local relationship learning with person-specific shape regularization for facial action unit detection. In: CVPR, pp. 11917\u201311926 (2019)","DOI":"10.1109\/CVPR.2019.01219"},{"key":"10_CR86","unstructured":"Pantic, M., Valstar, M., Rademaker, R., Maat, L.: Web-based database for facial expression analysis. In: ICME, pp. 5\u2013pp (2005)"},{"key":"10_CR87","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"10_CR88","doi-asserted-by":"crossref","unstructured":"Pfeiffer, J., Kamath, A., R\u00fcckl\u00e9, A., Cho, K., Gurevych, I.: AdapterFusion: Non-destructive task composition for transfer learning (2020). arXiv preprint arXiv:2005.00247","DOI":"10.18653\/v1\/2021.eacl-main.39"},{"issue":"4","key":"10_CR89","doi-asserted-by":"publisher","first-page":"361","DOI":"10.1111\/j.1467-9450.2011.00879.x","volume":"52","author":"TS Pixton","year":"2011","unstructured":"Pixton, T.S.: Happy to see me, aren\u2019t you, sally? Signal detection analysis of emotion detection in briefly presented male and female faces. Scand. J. Psychol. 52(4), 361\u2013368 (2011)","journal-title":"Scand. J. Psychol."},{"key":"10_CR90","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"key":"10_CR91","doi-asserted-by":"crossref","unstructured":"R\u00fcckl\u00e9, A., et al.: AdapterDrop: On the efficiency of adapters in transformers (2020). arXiv preprint arXiv:2010.11918","DOI":"10.18653\/v1\/2021.emnlp-main.626"},{"issue":"6","key":"10_CR92","doi-asserted-by":"publisher","first-page":"1161","DOI":"10.1037\/h0077714","volume":"39","author":"JA Russell","year":"1980","unstructured":"Russell, J.A.: A circumplex model of affect. J. Pers. Soc. Psychol. 39(6), 1161 (1980)","journal-title":"J. Pers. Soc. Psychol."},{"key":"10_CR93","doi-asserted-by":"crossref","unstructured":"Saneiro, M., Santos, O.C., Salmeron-Majadas, S., Boticario, J.G., et\u00a0al.: Towards emotion detection in educational scenarios from facial expressions and body movements through multimodal approaches. Sci. World J. 
2014, 484873 (2014)","DOI":"10.1155\/2014\/484873"},{"key":"10_CR94","doi-asserted-by":"crossref","unstructured":"Sankaran, N., Mohan, D.D., Setlur, S., Govindaraju, V., Fedorishin, D.: Representation learning through cross-modality supervision. In: IEEE FG, pp.\u00a01\u20138 (2019)","DOI":"10.1109\/FG.2019.8756519"},{"key":"10_CR95","unstructured":"Savchenko, A.V.: Frame-level prediction of facial expressions, valence, arousal and action units for mobile devices (2022). arXiv preprint arXiv:2203.13436"},{"issue":"2","key":"10_CR96","doi-asserted-by":"publisher","first-page":"321","DOI":"10.1007\/s11263-020-01378-z","volume":"129","author":"Z Shao","year":"2021","unstructured":"Shao, Z., Liu, Z., Cai, J., Ma, L.: JAA-Net: joint facial action unit detection and face alignment via adaptive attention. IJCV 129(2), 321\u2013340 (2021)","journal-title":"IJCV"},{"key":"10_CR97","doi-asserted-by":"crossref","unstructured":"Shao, Z., Liu, Z., Cai, J., Wu, Y., Ma, L.: Facial action unit detection using attention and relation learning. IEEE Trans. Affect Comput. 13(3), 1274\u20131289 (2019)","DOI":"10.1109\/TAFFC.2019.2948635"},{"key":"10_CR98","unstructured":"Shen, J., Wang, H., Gui, S., Tan, J., Wang, Z., Liu, J.: UMEC: Unified model and embedding compression for efficient recommendation systems. In: International Conference on Learning Representations (2021). https:\/\/openreview.net\/forum?id=BM---bH_RSh"},{"key":"10_CR99","doi-asserted-by":"crossref","unstructured":"Song, T., Cui, Z., Wang, Y., Zheng, W., Ji, Q.: Dynamic probabilistic graph convolution for facial action unit intensity estimation. In: CVPR, pp. 4845\u20134854 (2021)","DOI":"10.1109\/CVPR46437.2021.00481"},{"key":"10_CR100","doi-asserted-by":"crossref","unstructured":"Song, T., Cui, Z., Zheng, W., Ji, Q.: Hybrid message passing with performance-driven structures for facial action unit detection. In: CVPR, pp. 6267\u20136276 (2021)","DOI":"10.1109\/CVPR46437.2021.00620"},{"issue":"2","key":"10_CR101","doi-asserted-by":"publisher","first-page":"1037","DOI":"10.1109\/TAFFC.2020.2986962","volume":"13","author":"B Sun","year":"2020","unstructured":"Sun, B., Cao, S., Li, D., He, J., Yu, L.: Dynamic micro-expression recognition using knowledge distillation. IEEE Trans. Affect. Comput. 13(2), 1037\u20131043 (2020)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10_CR102","unstructured":"Sun, Q., et al.: Generative pretraining in multimodality (2023). arXiv preprint arXiv:2307.05222"},{"key":"10_CR103","doi-asserted-by":"crossref","unstructured":"Sun, X., Zeng, J., Shan, S.: Emotion-aware contrastive learning for facial action unit detection. In: FG, pp. 01\u201308 (2021)","DOI":"10.1109\/FG52635.2021.9666945"},{"key":"10_CR104","first-page":"24193","volume":"34","author":"YL Sung","year":"2021","unstructured":"Sung, Y.L., Nair, V., Raffel, C.A.: Training neural networks with fixed sparse masks. Adv. Neural Inform. Process. Syst. 34, 24193\u201324205 (2021)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"10_CR105","unstructured":"Tan, Z., et al.: Large language models for data annotation: A survey (2024). arXiv preprint arXiv:2402.13446"},{"key":"10_CR106","doi-asserted-by":"crossref","unstructured":"Tang, Y., Zeng, W., Zhao, D., Zhang, H.: PIAP-DF: pixel-interested and anti person-specific facial action unit detection net with discrete feedback learning. In: ICCV, pp. 
12899\u201312908 (2021)","DOI":"10.1109\/ICCV48922.2021.01266"},{"key":"10_CR107","unstructured":"Touvron, H., et\u00a0al.: LLaMA: Open and efficient foundation language models (2023). arXiv preprint arXiv:2302.13971"},{"key":"10_CR108","unstructured":"Touvron, H., et\u00a0al.: Llama 2: Open foundation and fine-tuned chat models (2023). arXiv preprint arXiv:2307.09288"},{"key":"10_CR109","first-page":"4057","volume":"29","author":"K Wang","year":"2020","unstructured":"Wang, K., Peng, X., Yang, J., Meng, D., Qiao, Y.: Region attention networks for pose and occlusion robust facial expression recognition. IEEE TIP 29, 4057\u20134069 (2020)","journal-title":"IEEE TIP"},{"key":"10_CR110","unstructured":"Wang, W., et\u00a0al.: Cogvlm: Visual expert for pretrained language models (2023). arXiv preprint arXiv:2311.03079"},{"key":"10_CR111","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: FERV39k: a large-scale multi-scene dataset for facial expression recognition in videos. In: CVPR, pp. 20922\u201320931 (2022)","DOI":"10.1109\/CVPR52688.2022.02025"},{"key":"10_CR112","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: AdaMix: Mixture-of-adaptations for parameter-efficient model tuning (2022). arXiv preprint arXiv:2210.17451","DOI":"10.18653\/v1\/2022.emnlp-main.388"},{"key":"10_CR113","unstructured":"Wang, Y., et al.: Exploring the reasoning abilities of multimodal large language models (MLLMs): A comprehensive survey on emerging trends in multimodal reasoning (2024). arXiv preprint arXiv:2401.06805"},{"key":"10_CR114","doi-asserted-by":"crossref","unstructured":"Wang, Z., Zeng, F., Liu, S., Zeng, B.: OAENet: oriented attention ensemble for accurate facial expression recognition. PR 112, 107694 (2021)","DOI":"10.1016\/j.patcog.2020.107694"},{"issue":"2","key":"10_CR115","doi-asserted-by":"publisher","first-page":"199","DOI":"10.3390\/biomimetics8020199","volume":"8","author":"Z Wen","year":"2023","unstructured":"Wen, Z., Lin, W., Wang, T., Xu, G.: Distract your attention: multi-head cross attention network for facial expression recognition. Biomimetics 8(2), 199 (2023)","journal-title":"Biomimetics"},{"key":"10_CR116","unstructured":"Wu, S., Fei, H., Qu, L., Ji, W., Chua, T.S.: NExT-GPT: Any-to-any multimodal LLM (2023). CoRR abs\/2309.05519"},{"key":"10_CR117","doi-asserted-by":"crossref","unstructured":"Xie, Q., Luong, M.T., Hovy, E., Le, Q.V.: Self-training with noisy student improves imagenet classification. In: CVPR, pp. 10687\u201310698 (2020)","DOI":"10.1109\/CVPR42600.2020.01070"},{"key":"10_CR118","doi-asserted-by":"crossref","unstructured":"Xie, S., Hu, H., Wu, Y.: Deep multi-path convolutional neural network joint with salient region attention for facial expression recognition. PR 92, 177\u2013191 (2019)","DOI":"10.1016\/j.patcog.2019.03.019"},{"key":"10_CR119","unstructured":"Xu, L., Xie, H., Qin, S.Z.J., Tao, X., Wang, F.L.: Parameter-efficient fine-tuning methods for pretrained language models: A critical review and assessment (2023). arXiv preprint arXiv:2312.12148"},{"key":"10_CR120","doi-asserted-by":"crossref","unstructured":"Xu, R., et al.: Raise a child in large language model: Towards effective and generalizable fine-tuning (2021). arXiv preprint arXiv:2109.05687","DOI":"10.18653\/v1\/2021.emnlp-main.749"},{"key":"10_CR121","doi-asserted-by":"crossref","unstructured":"Xue, F., Wang, Q., Tan, Z., Ma, Z., Guo, G.: Vision transformer with attentive pooling for robust facial expression recognition. IEEE Trans. Affect Comput. 
14(4), 3244\u20133256 (2022)","DOI":"10.1109\/TAFFC.2022.3226473"},{"issue":"1","key":"10_CR122","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0086041","volume":"9","author":"WJ Yan","year":"2014","unstructured":"Yan, W.J., et al.: CASME II: an improved spontaneous micro-expression database and the baseline evaluation. PLoS ONE 9(1), e86041 (2014)","journal-title":"PLoS ONE"},{"key":"10_CR123","doi-asserted-by":"crossref","unstructured":"Yang, H., Yin, L., Zhou, Y., Gu, J.: Exploiting semantic embedding and visual feature for facial action unit detection. In: CVPR, pp. 10482\u201310491 (2021)","DOI":"10.1109\/CVPR46437.2021.01034"},{"key":"10_CR124","doi-asserted-by":"crossref","unstructured":"Ye, Q., et al.: mPLUG-Owl2: Revolutionizing multi-modal large language model with modality collaboration (2023). arXiv preprint arXiv:2311.04257","DOI":"10.1109\/CVPR52733.2024.01239"},{"key":"10_CR125","doi-asserted-by":"crossref","unstructured":"Yin, Y., et al.: FG-Net: facial action unit detection with generalizable pyramidal features. In: Winter Conference on Applications of Computer Vision, pp. 6099\u20136108 (2024)","DOI":"10.1109\/WACV57701.2024.00599"},{"key":"10_CR126","unstructured":"You, H., et al.: Ferret: Refer and ground anything anywhere at any granularity. In: ICLR (2023)"},{"key":"10_CR127","unstructured":"Yu, S., et al.: Unified visual transformer compression (2022). arXiv preprint arXiv:2203.08243"},{"key":"10_CR128","unstructured":"Zaken, E.B., Ravfogel, S., Goldberg, Y.: BitFit: Simple parameter-efficient fine-tuning for transformer-based masked language-models (2021). arXiv preprint arXiv:2106.10199"},{"key":"10_CR129","doi-asserted-by":"crossref","unstructured":"Zhang, D., et al.: MM-LLMs: Recent advances in multimodal large language models (2024). arXiv preprint arXiv:2401.13601","DOI":"10.18653\/v1\/2024.findings-acl.738"},{"key":"10_CR130","unstructured":"Zhang, Q., et al.: Adaptive budget allocation for parameter-efficient fine-tuning (2023). arXiv preprint arXiv:2303.10512"},{"key":"10_CR131","doi-asserted-by":"crossref","unstructured":"Zhang, S., Pan, Y., Wang, J.Z.: Learning emotion representations from verbal and nonverbal communication. In: CVPR, pp. 18993\u201319004 (2023)","DOI":"10.1109\/CVPR52729.2023.01821"},{"key":"10_CR132","unstructured":"Zhang, S., et\u00a0al.: OPT: Open pre-trained transformer language models (2022). arXiv preprint arXiv:2205.01068"},{"key":"10_CR133","doi-asserted-by":"crossref","unstructured":"Zhang, X., Yang, H., Wang, T., Li, X., Yin, L.: Multimodal channel-mixing: Channel and spatial masked autoencoder on facial action unit detection. In: Wint. Appl. Comput. Vis, pp. 6077\u20136086 (2024)","DOI":"10.1109\/WACV57701.2024.00597"},{"key":"10_CR134","doi-asserted-by":"crossref","unstructured":"Zhang, X., et al.: A high-resolution spontaneous 3D dynamic facial expression database. In: IEEE FG, pp.\u00a01\u20136 (2013)","DOI":"10.1109\/FG.2013.6553788"},{"issue":"10","key":"10_CR135","doi-asserted-by":"publisher","first-page":"692","DOI":"10.1016\/j.imavis.2014.06.002","volume":"32","author":"X Zhang","year":"2014","unstructured":"Zhang, X., et al.: Bp4d-spontaneous: a high-resolution spontaneous 3D dynamic facial expression database. Ima. Vis. Comput. 32(10), 692\u2013706 (2014)","journal-title":"Ima. Vis. Comput."},{"key":"10_CR136","unstructured":"Zhang, Y., Wang, C., Deng, W.: Relative uncertainty learning for facial expression recognition. In: Advances in Neural Information Processing Systems, pp. 
17616\u201317627 (2021)"},{"key":"10_CR137","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Wang, C., Ling, X., Deng, W.: Learn from all: Erasing attention consistency for noisy label facial expression recognition. In: ECCV, pp. 418\u2013434 (2022)","DOI":"10.1007\/978-3-031-19809-0_24"},{"key":"10_CR138","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Wang, L., Yang, J.: Weakly supervised video emotion detection and prediction via cross-modal temporal erasing network. In: CVPR, pp. 18888\u201318897 (2023)","DOI":"10.1109\/CVPR52729.2023.01811"},{"key":"10_CR139","doi-asserted-by":"crossref","unstructured":"Zhao, K., Chu, W.S., Martinez, A.M.: Learning facial action units from web images with scalable weakly supervised clustering. In: CVPR, pp. 2090\u20132099 (2018)","DOI":"10.1109\/CVPR.2018.00223"},{"key":"10_CR140","doi-asserted-by":"crossref","unstructured":"Zhao, K., Chu, W.S., Zhang, H.: Deep region and multi-label learning for facial action unit detection. In: CVPR, pp. 3391\u20133399 (2016)","DOI":"10.1109\/CVPR.2016.369"},{"key":"10_CR141","doi-asserted-by":"crossref","unstructured":"Zhao, S., Li, Y., Yao, X., Nie, W., Xu, P., Yang, J., Keutzer, K.: Emotion-based end-to-end matching between image and music in valence-arousal space. In: ACM MM, pp. 2945\u20132954 (2020)","DOI":"10.1145\/3394171.3413776"},{"key":"10_CR142","first-page":"6544","volume":"30","author":"Z Zhao","year":"2021","unstructured":"Zhao, Z., Liu, Q., Wang, S.: Learning deep global multi-scale and local attention features for facial expression recognition in the wild. IEEE TIP 30, 6544\u20136556 (2021)","journal-title":"IEEE TIP"},{"key":"10_CR143","doi-asserted-by":"crossref","unstructured":"Zhao, Z., Liu, Q., Zhou, F.: Robust lightweight facial expression recognition network with label distribution training. In: AAAI, pp. 3510\u20133519 (2021)","DOI":"10.1609\/aaai.v35i4.16465"},{"key":"10_CR144","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: Enhancing vision-language understanding with advanced large language models (2023). 
arXiv preprint arXiv:2304.10592"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72649-1_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:06:55Z","timestamp":1727593615000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72649-1_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031726484","9783031726491"],"references-count":144,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72649-1_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}