{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T11:29:37Z","timestamp":1764588577887,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680584","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"10373-10381","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Integrating Content-Semantics-World Knowledge to Detect Stress from Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2992-6758","authenticated-orcid":false,"given":"Yang","family":"Ding","sequence":"first","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1219-2436","authenticated-orcid":false,"given":"Yi","family":"Dai","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9536-8082","authenticated-orcid":false,"given":"Xin","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5397-1361","authenticated-orcid":false,"given":"Ling","family":"Feng","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2778-6870","authenticated-orcid":false,"given":"Lei","family":"Cao","sequence":"additional","affiliation":[{"name":"Faculty of Psychology, Beijing Normal University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3048-4969","authenticated-orcid":false,"given":"Huijun","family":"Zhang","sequence":"additional","affiliation":[{"name":"China Huaneng Clean Energy Research Institute, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-022-00984-w"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jad.2014.10.054"},{"key":"e_1_3_2_1_4_1","volume-title":"Marlin: Masked autoencoder for facial video representation learning. In CVPR. 1493--1504.","author":"Zhixi Cai","year":"2023","unstructured":"Zhixi Cai et al. 2023. Marlin: Masked autoencoder for facial video representation learning. In CVPR. 1493--1504."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.2307\/2136404"},{"key":"e_1_3_2_1_6_1","volume-title":"Observer-based measurement of facial expression with the Facial Action Coding System. The handbook of emotion elicitation and assessment","author":"Cohn Jeffrey F","year":"2007","unstructured":"Jeffrey F Cohn, Zara Ambadar, and Paul Ekman. 2007. Observer-based measurement of facial expression with the Facial Action Coding System. The handbook of emotion elicitation and assessment, Vol. 1, 3 (2007), 203--221."},{"key":"e_1_3_2_1_7_1","volume-title":"Cues to deception. Psychological bulletin","author":"DePaulo Bella M","year":"2003","unstructured":"Bella M DePaulo, James J Lindsay, Brian E Malone, Laura Muhlenbruck, Kelly Charlton, and Harris Cooper. 2003. Cues to deception. Psychological bulletin, Vol. 129, 1 (2003), 74."},{"key":"e_1_3_2_1_8_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/JBHI.2023.3283338"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/AFGR.1998.670965"},{"volume-title":"What the face reveals: Basic and applied studies of spontaneous expression using the Facial Action Coding System (FACS)","author":"Ekman Paul","key":"e_1_3_2_1_11_1","unstructured":"Paul Ekman and Erika L Rosenberg. 1997. What the face reveals: Basic and applied studies of spontaneous expression using the Facial Action Coding System (FACS). Oxford University Press, USA."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/AICCSA.2018.8612825"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2014.7026203"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.3390\/s19173693"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.clinpsy.1.102803.143938"},{"volume-title":"Quantitative Multidimensional Stress Assessment from Facial Videos. Ph.,D. Dissertation","author":"Lin He.","key":"e_1_3_2_1_17_1","unstructured":"Lin He. 2022. Quantitative Multidimensional Stress Assessment from Facial Videos. Ph.,D. Dissertation. Marquette University."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/0022-3999(67)90010-4"},{"key":"e_1_3_2_1_19_1","volume-title":"Artificial Life and Robotics","volume":"25","author":"Iuchi K.","year":"2020","unstructured":"K. Iuchi, R. Mitsuhashi, T. Goto, and et al. 2020. Stress levels estimation from facial video based on non-contact measurement of pulse wave. Artificial Life and Robotics, Vol. 25 (July 2020), 335--342."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.3390\/s21227498"},{"key":"e_1_3_2_1_21_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3057578"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2015.2496320"},{"key":"e_1_3_2_1_24_1","volume-title":"Facial expressions of emotion reveal neuroendocrine and cardiovascular stress responses. Biological psychiatry","author":"Lerner Jennifer S","year":"2007","unstructured":"Jennifer S Lerner, Ronald E Dahl, Ahmad R Hariri, and Shelley E Taylor. 2007. Facial expressions of emotion reveal neuroendocrine and cardiovascular stress responses. Biological psychiatry, Vol. 61, 2 (2007), 253--260."},{"key":"e_1_3_2_1_25_1","volume-title":"Incorporating forthcoming events and personality traits in social media based stress prediction","author":"Li Ningyun","year":"2021","unstructured":"Ningyun Li, Huijun Zhang, and Ling Feng. 2021. Incorporating forthcoming events and personality traits in social media based stress prediction. IEEE Transactions on Affective Computing (2021)."},{"key":"e_1_3_2_1_26_1","volume-title":"A comprehensive evaluation of gpt-4v on knowledge-intensive visual question answering. arXiv preprint arXiv:2311.07536","author":"Li Yunxin","year":"2023","unstructured":"Yunxin Li, Longyue Wang, Baotian Hu, Xinyu Chen, Wanqi Zhong, Chenyang Lyu, and Min Zhang. 2023. A comprehensive evaluation of gpt-4v on knowledge-intensive visual question answering. arXiv preprint arXiv:2311.07536 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"GPT-4V with emotion: A zero-shot benchmark for Generalized Emotion Recognition. Information Fusion","author":"Lian Zheng","year":"2024","unstructured":"Zheng Lian, Licai Sun, Haiyang Sun, Kang Chen, Zhuofan Wen, Hao Gu, Bin Liu, and Jianhua Tao. 2024. GPT-4V with emotion: A zero-shot benchmark for Generalized Emotion Recognition. Information Fusion (2024), 102367."},{"key":"e_1_3_2_1_28_1","volume-title":"A survey on hallucination in large vision-language models. arXiv preprint arXiv:2402.00253","author":"Liu Hanchao","year":"2024","unstructured":"Hanchao Liu, Wenyuan Xue, Yifei Chen, Dapeng Chen, Xiutian Zhao, Ke Wang, Liping Hou, Rongjun Li, and Wei Peng. 2024. A survey on hallucination in large vision-language models. arXiv preprint arXiv:2402.00253 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.3390\/app12010327"},{"key":"e_1_3_2_1_30_1","volume-title":"Performing under pressure: stress and cognitive function. Applied animal behaviour science","author":"Mendl Michael","year":"1999","unstructured":"Michael Mendl. 1999. Performing under pressure: stress and cognitive function. Applied animal behaviour science, Vol. 65, 3 (1999), 221--244."},{"key":"e_1_3_2_1_31_1","volume-title":"Emotional state and the detection of change in facial expression of emotion. European journal of social psychology","author":"Niedenthal Paula M","year":"2000","unstructured":"Paula M Niedenthal, Jamin B Halberstadt, Jonathan Margolin, and \u00c5se H Innes-Ker. 2000. Emotional state and the detection of change in facial expression of emotion. European journal of social psychology, Vol. 30, 2 (2000), 211--222."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.bbe.2019.01.004"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748--8763. https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/IWOBI.2015.7160155"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICEET53442.2021.9659697"},{"key":"e_1_3_2_1_36_1","volume-title":"Microprocessors and Microsystems","volume":"95","author":"Astha","year":"2022","unstructured":"Astha Singh et al. 2022. Detection of stress, anxiety and depression (SAD) in video surveillance using ResNet-101. Microprocessors and Microsystems, Vol. 95 (2022)."},{"key":"e_1_3_2_1_37_1","volume-title":"\u0141 ukasz Kaiser, and Illia Polosukhin","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CBMI.2018.8516497"},{"key":"e_1_3_2_1_39_1","volume-title":"To see is to believe: Prompting gpt-4v for better visual instruction tuning. arXiv preprint arXiv:2311.07574","author":"Wang Junke","year":"2023","unstructured":"Junke Wang, Lingchen Meng, Zejia Weng, Bo He, Zuxuan Wu, and Yu-Gang Jiang. 2023. To see is to believe: Prompting gpt-4v for better visual instruction tuning. arXiv preprint arXiv:2311.07574 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00693"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3512013"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413596"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599795"},{"key":"e_1_3_2_1_44_1","volume-title":"An early evaluation of gpt-4v (ision). arXiv preprint arXiv:2310.16534","author":"Wu Yang","year":"2023","unstructured":"Yang Wu, Shilong Wang, Hao Yang, Tian Zheng, Hongbo Zhang, Yanyan Zhao, and Bing Qin. 2023. An early evaluation of gpt-4v (ision). arXiv preprint arXiv:2310.16534 (2023)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICISCE.2017.95"},{"key":"e_1_3_2_1_46_1","volume-title":"The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421","author":"Yang Zhengyuan","year":"2023","unstructured":"Zhengyuan Yang, Linjie Li, Kevin Lin, Jianfeng Wang, Chung-Ching Lin, Zicheng Liu, and Lijuan Wang. 2023. The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421, Vol. 9, 1 (2023), 1."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.3390\/s20195552"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/SIPROCESS.2019.8868735"},{"key":"e_1_3_2_1_49_1","unstructured":"Yue Zhang Yafu Li Leyang Cui Deng Cai Lemao Liu Tingchen Fu Xinting Huang Enbo Zhao Yu Zhang Yulong Chen et al. 2023. Siren's song in the AI ocean: a survey on hallucination in large language models. arXiv preprint arXiv:2309.01219 (2023)."},{"key":"e_1_3_2_1_50_1","volume-title":"if grounded. arXiv preprint arXiv:2401.01614","author":"Zheng Boyuan","year":"2024","unstructured":"Boyuan Zheng, Boyu Gou, Jihyung Kil, Huan Sun, and Yu Su. 2024. Gpt-4v (ision) is a generalist web agent, if grounded. arXiv preprint arXiv:2401.01614 (2024)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680584","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680584","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:56Z","timestamp":1750295876000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680584"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":50,"alternative-id":["10.1145\/3664647.3680584","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680584","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}