{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T07:08:59Z","timestamp":1761894539518,"version":"build-2065373602"},"reference-count":41,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/icme59968.2025.11208908","type":"proceedings-article","created":{"date-parts":[[2025,10,30]],"date-time":"2025-10-30T17:57:42Z","timestamp":1761847062000},"page":"1-6","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal Representation Learning Techniques for Comprehensive Facial State Analysis"],"prefix":"10.1109","author":[{"given":"Kaiwen","family":"Zheng","sequence":"first","affiliation":[{"name":"University of Glasgow,School of Computing Science,Glasgow,United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuri","family":"Ge","sequence":"additional","affiliation":[{"name":"Shandong University,School of Artificial Intelligence,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junchen","family":"Fu","sequence":"additional","affiliation":[{"name":"University of Glasgow,School of Computing Science,Glasgow,United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jun","family":"Peng","sequence":"additional","affiliation":[{"name":"Peng Cheng Laboratory,Shenzhen,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Joemon M.","family":"Jose","sequence":"additional","affiliation":[{"name":"University of Glasgow,School of Computing Science,Glasgow,United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref1","DOI":"10.1109\/CVPRW53098.2021.00173"},{"doi-asserted-by":"publisher","key":"ref2","DOI":"10.1109\/ACCESS.2020.3005687"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1109\/CVPR52688.2022.01814"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.1109\/AICCSA.2017.124"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.1007\/s00371-019-01707-5"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.1109\/FG52635.2021.9666961"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1109\/TBIOM.2023.3306810"},{"doi-asserted-by":"publisher","key":"ref8","DOI":"10.1109\/IGSC48788.2019.8957163"},{"doi-asserted-by":"publisher","key":"ref9","DOI":"10.3390\/s18020416"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1109\/TMI.2019.2913158"},{"doi-asserted-by":"publisher","key":"ref11","DOI":"10.1016\/j.bspc.2023.104744"},{"year":"2022","author":"Yu","article-title":"Coca: Contrastive captioners are image-text foundation models","key":"ref12"},{"year":"2022","author":"Li","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","key":"ref13"},{"volume-title":"BMVC 2023","author":"Yuan","article-title":"Describe your facial expressions by linking image encoders and large language models","key":"ref14"},{"doi-asserted-by":"publisher","key":"ref15","DOI":"10.1145\/3664647.3681443"},{"doi-asserted-by":"publisher","key":"ref16","DOI":"10.1016\/j.iotcps.2023.04.003"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.1109\/CVPRW59228.2023.00626"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.1109\/TAFFC.2017.2740923"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.1109\/TIP.2018.2868382"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.1145\/3394171.3413620"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.1109\/T-AFFC.2013.4"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.1109\/CVPR52688.2022.02025"},{"doi-asserted-by":"publisher","key":"ref23","DOI":"10.1109\/ICCVW.2011.6130508"},{"doi-asserted-by":"publisher","key":"ref24","DOI":"10.1016\/j.imavis.2017.02.001"},{"doi-asserted-by":"publisher","key":"ref25","DOI":"10.1109\/FG.2017.144"},{"key":"ref26","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-030-69544-6_5","article-title":"Raf-au database: In-the-wild facial expressions with subjective emotion judgement and objective au annotations","volume-title":"ACCV","author":"Yan"},{"doi-asserted-by":"publisher","key":"ref27","DOI":"10.1109\/CVPRW.2010.5543262"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.1109\/CVPR.2016.600"},{"doi-asserted-by":"publisher","key":"ref29","DOI":"10.1007\/978-3-319-39513-5_5"},{"doi-asserted-by":"publisher","key":"ref30","DOI":"10.1016\/j.imavis.2014.06.002"},{"year":"2023","author":"Li","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","key":"ref31"},{"year":"2021","author":"Dosovitskiy","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","key":"ref32"},{"year":"2019","author":"Devlin","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","key":"ref33"},{"doi-asserted-by":"publisher","key":"ref34","DOI":"10.1109\/ICCV.2017.116"},{"year":"2023","author":"Mao","article-title":"Cross-entropy loss functions: Theoretical analysis and applications","key":"ref35"},{"doi-asserted-by":"publisher","key":"ref36","DOI":"10.24963\/ijcai.2022\/173"},{"doi-asserted-by":"publisher","key":"ref37","DOI":"10.1109\/tkde.2025.3608071"},{"doi-asserted-by":"publisher","key":"ref38","DOI":"10.1145\/3626772.3657725"},{"doi-asserted-by":"publisher","key":"ref39","DOI":"10.1145\/3616855.3635805"},{"key":"ref40","first-page":"2790","article-title":"Parameter-efficient transfer learning for nlp","volume-title":"International conference on machine learning","author":"Houlsby"},{"doi-asserted-by":"publisher","key":"ref41","DOI":"10.1016\/j.simpa.2022.100433"}],"event":{"name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","start":{"date-parts":[[2025,6,30]]},"location":"Nantes, France","end":{"date-parts":[[2025,7,4]]}},"container-title":["2025 IEEE International Conference on Multimedia and Expo (ICME)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11208895\/11208897\/11208908.pdf?arnumber=11208908","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T05:30:08Z","timestamp":1761888608000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11208908\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/icme59968.2025.11208908","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}