{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T14:22:23Z","timestamp":1780410143303,"version":"3.54.1"},"reference-count":43,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,12,15]],"date-time":"2021-12-15T00:00:00Z","timestamp":1639526400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,12,15]],"date-time":"2021-12-15T00:00:00Z","timestamp":1639526400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF","doi-asserted-by":"publisher","award":["CNS-1629898"],"award-info":[{"award-number":["CNS-1629898"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,12,15]]},"DOI":"10.1109\/fg52635.2021.9667030","type":"proceedings-article","created":{"date-parts":[[2022,1,26]],"date-time":"2022-01-26T05:34:23Z","timestamp":1643175263000},"page":"1-8","source":"Crossref","is-referenced-by-count":10,"title":["Multi-Modal Learning for AU Detection Based on Multi-Head Fused Transformers"],"prefix":"10.1109","author":[{"given":"Xiang","family":"Zhang","sequence":"first","affiliation":[{"name":"State University of New York,Department of Computer Science,Binghamton,NY,USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lijun","family":"Yin","sequence":"additional","affiliation":[{"name":"State University of New York,Department of Computer Science,Binghamton,NY,USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2014.06.002"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00391"},{"key":"ref33","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1656"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_43"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3347320.3357688"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01034"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413538"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00583"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/T-AFFC.2012.32"},{"key":"ref10","article-title":"An image is worth 16&#x00D7;16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2020","journal-title":"International Conference on Learning Representations"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.374"},{"key":"ref11","author":"ekman","year":"1997","journal-title":"What the Face Reveals Basic and Applied Studies of Spontaneous Expression using the Facial Action Coding System (FACS)"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2018.07.002"},{"key":"ref14","article-title":"Multimodal unsupervised image-to-image translation","author":"huang","year":"2018","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref15","first-page":"88","article-title":"Spatiotemporal analysis of rgb-dt facial images for multimodal pain level recognition","author":"irani","year":"2015","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/s12193-015-0195-2"},{"key":"ref17","first-page":"1097","article-title":"Imagenet classification with deep convolutional neural networks","volume":"25","author":"krizhevsky","year":"2012","journal-title":"Advances in neural information processing systems"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2019.8756629"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018594"},{"key":"ref28","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","author":"raffel","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref4","article-title":"Language models are few-shot learners","author":"brown","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00700"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"ref6","first-page":"213","article-title":"End-to-end object detection with transformers","author":"carion","year":"2020","journal-title":"European Conference on Computer Vision"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2738401"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793868"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01258-8_19"},{"key":"ref2","article-title":"Layer normalization","author":"ba","year":"2016","journal-title":"ArXiv Preprint"},{"key":"ref9","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"NAACL-HLT"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00126"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018594"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2713408"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2015.07.005"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.369"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00235"},{"key":"ref41","first-page":"226","article-title":"Identity-based adversarial training of deep cnns for facial action unit recognition","author":"zhang","year":"2018","journal-title":"BMVC"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2017.136"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01219"},{"key":"ref43","article-title":"Deformable detr: Deformable transformers for end-to-end object detection","author":"zhu","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2461544"}],"event":{"name":"2021 16th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2021)","location":"Jodhpur, India","start":{"date-parts":[[2021,12,15]]},"end":{"date-parts":[[2021,12,18]]}},"container-title":["2021 16th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2021)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9666787\/9666788\/09667030.pdf?arnumber=9667030","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,2]],"date-time":"2022-08-02T23:31:39Z","timestamp":1659483099000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9667030\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,15]]},"references-count":43,"URL":"https:\/\/doi.org\/10.1109\/fg52635.2021.9667030","relation":{},"subject":[],"published":{"date-parts":[[2021,12,15]]}}}