{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T14:07:08Z","timestamp":1780495628219,"version":"3.54.1"},"reference-count":44,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62206219"],"award-info":[{"award-number":["62206219"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376215"],"award-info":[{"award-number":["62376215"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.patcog.2026.113544","type":"journal-article","created":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T09:20:57Z","timestamp":1773825657000},"page":"113544","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["LLM-driven fine-grained emotion parsing and parameterized mapping for conversational TTS"],"prefix":"10.1016","volume":"179","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8675-3980","authenticated-orcid":false,"given":"Xiaochun","family":"An","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4927-9444","authenticated-orcid":false,"given":"Xu","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7046-7787","authenticated-orcid":false,"given":"Xiaoge","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3582-6809","authenticated-orcid":false,"given":"Ercheng","family":"Pei","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0689-145X","authenticated-orcid":false,"given":"Qingli","family":"Yan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2515-8579","authenticated-orcid":false,"given":"Lang","family":"He","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.113544_bib0001","series-title":"Proceedings of 4th International Conference on Sustainable Expert Systems (ICSES)","first-page":"1715","article-title":"SPEAR: Design and implementation of an advanced virtual assistant","author":"Jain","year":"2024"},{"key":"10.1016\/j.patcog.2026.113544_bib0002","first-page":"1","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2025","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.patcog.2026.113544_bib0003","series-title":"Proceedings of the IEEE Spoken Language Technology Workshop (SLT)","first-page":"403","article-title":"Conversational end-to-end TTS for voice agents","author":"Guo","year":"2021"},{"key":"10.1016\/j.patcog.2026.113544_bib0004","series-title":"Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"10706","article-title":"CONCSS: Contrastive-based context comprehension for dialogue-appropriate prosody in conversational speech synthesis","author":"Deng","year":"2024"},{"key":"10.1016\/j.patcog.2026.113544_bib0005","doi-asserted-by":"crossref","first-page":"2654","DOI":"10.1109\/TASLP.2024.3395994","article-title":"Diffprosody: diffusion-based latent prosody generation for expressive speech synthesis with prosody conditional adversarial training","volume":"32","author":"Oh","year":"2024","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.patcog.2026.113544_bib0006","doi-asserted-by":"crossref","first-page":"2365","DOI":"10.1109\/TAFFC.2025.3561267","article-title":"EmoSphere++: Emotion-controllable zero-shot text-to-speech via emotion-adaptive spherical vector","volume":"16","author":"Cho","year":"2025","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.patcog.2026.113544_bib0007","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"15863","article-title":"EmoDubber: Towards high quality and emotion controllable movie dubbing","author":"Cong","year":"2025"},{"issue":"3","key":"10.1016\/j.patcog.2026.113544_bib0008","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3641289","article-title":"A survey on evaluation of large language models","volume":"15","author":"Chang","year":"2024","journal-title":"ACM Trans. Intell. Syst. Technol."},{"key":"10.1016\/j.patcog.2026.113544_bib0009","series-title":"Proceedings of the IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","first-page":"1","article-title":"SLM: Bridge the thin gap between speech and text foundation models","author":"Wang","year":"2023"},{"key":"10.1016\/j.patcog.2026.113544_bib0010","series-title":"Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL)","first-page":"6626","article-title":"Advancing large language models to capture varied speaking styles and respond properly in spoken conversations","author":"Lin","year":"2024"},{"key":"10.1016\/j.patcog.2026.113544_bib0011","series-title":"Proceedings of the ACM International Conference on Multimedia (ACM MM)","first-page":"4187","article-title":"Generative expressive conversational speech synthesis","author":"Liu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113544_bib0012","series-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)","first-page":"8863","article-title":"Multimodal fine-grained context interaction graph modeling for conversational speech synthesis","author":"Jia","year":"2025"},{"key":"10.1016\/j.patcog.2026.113544_bib0013","series-title":"Proceedings of the IEEE International Symposium on Chinese Spoken Language Processing (ISCSLP)","first-page":"586","article-title":"E-chat: Emotion-sensitive spoken dialogue system with large language models","author":"Xue","year":"2024"},{"key":"10.1016\/j.patcog.2026.113544_bib0014","series-title":"Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"1","article-title":"JELLY: Joint emotion recognition and context reasoning with LLMs for conversational speech synthesis","author":"Cha","year":"2025"},{"key":"10.1016\/j.patcog.2026.113544_bib0015","series-title":"Proceedings of the Findings of the Association for Computational Linguistics (ACL)","first-page":"1988","article-title":"Chain-Talker: Chain understanding and rendering for empathetic conversational speech synthesis","author":"Hu","year":"2025"},{"key":"10.1016\/j.patcog.2026.113544_bib0016","series-title":"Proceedings of the 14th Learning Analytics and Knowledge Conference","first-page":"518","article-title":"Prompt-based and fine-tuned GPT models for context-dependent and-independent deductive coding in social annotation","author":"Hou","year":"2024"},{"issue":"3","key":"10.1016\/j.patcog.2026.113544_bib0017","doi-asserted-by":"crossref","first-page":"50","DOI":"10.3390\/biomedinformatics5030050","article-title":"Using large language models to extract structured data from health coaching dialogues: A comparative study of code generation versus direct information extraction","volume":"5","author":"Kanduri","year":"2025","journal-title":"BioMedInformatics"},{"issue":"2","key":"10.1016\/j.patcog.2026.113544_bib0018","doi-asserted-by":"crossref","first-page":"195","DOI":"10.1007\/s10489-024-05908-x","article-title":"LMTformer: Facial depression recognition with lightweight multi-scale transformer from videos","volume":"55","author":"He","year":"2025","journal-title":"Appl. Intell."},{"key":"10.1016\/j.patcog.2026.113544_bib0019","doi-asserted-by":"crossref","DOI":"10.1016\/j.bspc.2024.106767","article-title":"LSCAformer: Long and short-term cross-attention-aware transformer for depression recognition from video sequences","volume":"98","author":"He","year":"2024","journal-title":"Biomed. Signal Process. Control."},{"key":"10.1016\/j.patcog.2026.113544_bib0020","doi-asserted-by":"crossref","DOI":"10.1016\/j.bspc.2024.106490","article-title":"Depressformer: Leveraging video swin transformer and fine-grained local features for depression scale estimation","volume":"96","author":"He","year":"2024","journal-title":"Biomed. Signal Process. Control."},{"key":"10.1016\/j.patcog.2026.113544_bib0021","doi-asserted-by":"crossref","first-page":"120","DOI":"10.1016\/j.neunet.2022.05.025","article-title":"Reducing noisy annotations for depression estimation from facial images","volume":"153","author":"He","year":"2022","journal-title":"Neural Netw."},{"issue":"10","key":"10.1016\/j.patcog.2026.113544_bib0022","doi-asserted-by":"crossref","first-page":"1159","DOI":"10.1109\/JPROC.2023.3309299","article-title":"Toward label-efficient emotion and sentiment analysis","volume":"111","author":"Zhao","year":"2023","journal-title":"Proc. IEEE"},{"key":"10.1016\/j.patcog.2026.113544_bib0023","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2026.113087","article-title":"Is multimodal conversational emotion recognition satisfactory? Exploring the gaps in performance, generalization, and confidence","author":"Tu","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113544_bib0024","series-title":"Proceedings of the 5th International Conference on Speech Prosody (SP5)","first-page":"1","article-title":"Improving TTS synthesis for emotional expressivity by a prosodic parameterization of affect based on linguistic analysis","author":"Shaikh","year":"2010"},{"key":"10.1016\/j.patcog.2026.113544_bib0025","doi-asserted-by":"crossref","first-page":"1448","DOI":"10.1109\/TASLP.2022.3164181","article-title":"Cross-speaker emotion disentangling and transfer for end-to-end speech synthesis","volume":"30","author":"Li","year":"2022","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.patcog.2026.113544_bib0026","series-title":"Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"10601","article-title":"Hierarchical emotion prediction and control in text-to-speech synthesis","author":"Inoue","year":"2024"},{"issue":"6","key":"10.1016\/j.patcog.2026.113544_bib0027","doi-asserted-by":"crossref","first-page":"4234","DOI":"10.1109\/TPAMI.2024.3356232","article-title":"Naturalspeech: End-to-end text-to-speech synthesis with human-level quality","volume":"46","author":"Tan","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.113544_bib0028","series-title":"Proceedings of the ACM International Conference on Multimedia (ACM MM)","first-page":"6081","article-title":"CMCU-CSS: Enhancing naturalness via commonsense-based multi-modal context understanding in conversational speech synthesis","author":"Deng","year":"2023"},{"issue":"2","key":"10.1016\/j.patcog.2026.113544_bib0029","doi-asserted-by":"crossref","first-page":"2635","DOI":"10.1109\/TCSVT.2025.3609776","article-title":"FedDAAM: Federated domain adversarial learning with attention mechanism for privacy preserving multimodal depression assessment","volume":"36","author":"He","year":"2026","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.113544_bib0030","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2025.103632","article-title":"LMVD: A large-scale multimodal vlog dataset for depression detection in the wild","volume":"126","author":"He","year":"2026","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.patcog.2026.113544_bib0031","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2026.104213","article-title":"FDA-CAPMA: Federated domain adaptation with co-activation pattern and multimodal mamba for fMRI depression detection","volume":"132","author":"He","year":"2026","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.patcog.2026.113544_bib0032","article-title":"Generalizable large language model based human keypoint localization for emotion recognition","author":"Li","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113544_bib0033","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","first-page":"18698","article-title":"Emotion rendering for conversational speech synthesis with heterogeneous graph-based context modeling","volume":"38","author":"Liu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113544_bib0034","doi-asserted-by":"crossref","first-page":"1506","DOI":"10.1109\/TASLP.2024.3363444","article-title":"METTS: Multilingual emotional text-to-speech by cross-speaker and cross-lingual emotion transfer","volume":"32","author":"Zhu","year":"2024","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.patcog.2026.113544_bib0035","series-title":"Proceedings of the IEEE 19th International Conference on Application of Information and Communication Technologies (AICT)","first-page":"1","article-title":"Multimodal LLMs for emotion-aware human\u2013robot interaction: Design and implementation","author":"Alneyadi","year":"2025"},{"key":"10.1016\/j.patcog.2026.113544_bib0036","series-title":"Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"1","article-title":"Intra-and inter-modal context interaction modeling for conversational speech synthesis","author":"Jia","year":"2025"},{"issue":"10","key":"10.1016\/j.patcog.2026.113544_bib0037","doi-asserted-by":"crossref","first-page":"6729","DOI":"10.1109\/TPAMI.2021.3094362","article-title":"Affective image content analysis: Two decades review and new perspectives","volume":"44","author":"Zhao","year":"2021","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"4","key":"10.1016\/j.patcog.2026.113544_bib0038","doi-asserted-by":"crossref","first-page":"526","DOI":"10.1109\/TAFFC.2016.2628787","article-title":"Predicting personalized image emotion perceptions in social networks","volume":"9","author":"Zhao","year":"2016","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"3","key":"10.1016\/j.patcog.2026.113544_bib0039","doi-asserted-by":"crossref","first-page":"1802","DOI":"10.1109\/TAFFC.2025.3539225","article-title":"SDRS: Sentiment-aware disentangled representation shifting for multimodal sentiment analysis","volume":"16","author":"Zhao","year":"2025","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.patcog.2026.113544_bib0040","series-title":"Proceedings of the Conference of the International Speech Communication Association (INTERSPEECH)","first-page":"4978","article-title":"XTTS: A massively multilingual zero-shot text-to-speech model","author":"Casanova","year":"2024"},{"key":"10.1016\/j.patcog.2026.113544_bib0041","series-title":"Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL)","first-page":"622","article-title":"ALoRA: Allocating low-rank adaptation for fine-tuning large language models","author":"Liu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113544_bib0042","series-title":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (ACL)","first-page":"16334","article-title":"Let\u2019s go real talk: Spoken dialogue model for face-to-face conversation","author":"Park","year":"2024"},{"key":"10.1016\/j.patcog.2026.113544_bib0043","series-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL)","first-page":"527","article-title":"MELD: A multimodal multi-party dataset for emotion recognition in conversations","author":"Poria","year":"2019"},{"issue":"2","key":"10.1016\/j.patcog.2026.113544_bib0044","first-page":"103","article-title":"Emotion detection via BERT-based deep learning approaches in natural language processing","volume":"9","author":"Aslan","year":"2024","journal-title":"Int. J. Energy Eng. Sci."}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326005108?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326005108?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T13:09:56Z","timestamp":1780492196000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326005108"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":44,"alternative-id":["S0031320326005108"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113544","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"LLM-driven fine-grained emotion parsing and parameterized mapping for conversational TTS","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113544","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113544"}}