{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T07:14:50Z","timestamp":1730272490547,"version":"3.28.0"},"reference-count":22,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,5,19]],"date-time":"2024-05-19T00:00:00Z","timestamp":1716076800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,19]],"date-time":"2024-05-19T00:00:00Z","timestamp":1716076800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,5,19]]},"DOI":"10.1109\/iscas58744.2024.10558240","type":"proceedings-article","created":{"date-parts":[[2024,7,2]],"date-time":"2024-07-02T17:22:52Z","timestamp":1719940972000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["Enhancing Cross-Modal Understanding for Audio Visual Scene-Aware Dialog Through Contrastive Learning"],"prefix":"10.1109","author":[{"given":"Feifei","family":"Xu","sequence":"first","affiliation":[{"name":"Shanghai University of Electric Power,Shanghai,China,201306"}]},{"given":"Wang","family":"Zhou","sequence":"additional","affiliation":[{"name":"Shanghai University of Electric Power,Shanghai,China,201306"}]},{"given":"Guangzhen","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai University of Electric Power,Shanghai,China,201306"}]},{"given":"Zheng","family":"Zhong","sequence":"additional","affiliation":[{"name":"Shanghai University of Electric Power,Shanghai,China,201306"}]},{"given":"Yingchen","family":"Zhou","sequence":"additional","affiliation":[{"name":"Shanghai University of Electric Power,Shanghai,China,201306"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00774"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3065823"},{"article-title":"Audio visual scene-aware dialog (avsd) challenge at dstc7[J]","year":"2018","author":"H","key":"ref3"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2021.3078368"},{"article-title":"From film to video: Multi-turn question answering with multi-modal context","year":"2018","author":"Nguyen","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.247"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746481"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.518"},{"article-title":"Learning reasoning paths over semantic graphs for video-grounded dialogues","year":"2021","author":"Le","key":"ref9"},{"issue":"8","key":"ref10","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"article-title":"Google\u2019s neural machine translation system: Bridging the gap between human and machine translation","year":"2016","author":"Wu","key":"ref11"},{"article-title":"Multi-step joint-modality attention network for scene-aware dialogue system","year":"2020","author":"Chu","key":"ref12"},{"article-title":"Audio visual scene-aware dialog system using dynamic memory networks","volume-title":"DSTC8 at AAAI2020 workshop","author":"Xie","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/s10590-009-9059-4"},{"key":"ref16","article-title":"Rouge: A package for automatic evaluation of sum-maries","author":"Lin","year":"2004","journal-title":"Text summarization branches out."},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1564"},{"issue":"11","key":"ref19","first-page":"13618","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"37","author":"W"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747769"},{"article-title":"Audio visual scene-aware dialog generation with transformer-based video representations","year":"2022","author":"Yamazaki","key":"ref22"}],"event":{"name":"2024 IEEE International Symposium on Circuits and Systems (ISCAS)","start":{"date-parts":[[2024,5,19]]},"location":"Singapore, Singapore","end":{"date-parts":[[2024,5,22]]}},"container-title":["2024 IEEE International Symposium on Circuits and Systems (ISCAS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10557746\/10557828\/10558240.pdf?arnumber=10558240","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,3]],"date-time":"2024-07-03T07:53:42Z","timestamp":1719993222000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10558240\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,19]]},"references-count":22,"URL":"https:\/\/doi.org\/10.1109\/iscas58744.2024.10558240","relation":{},"subject":[],"published":{"date-parts":[[2024,5,19]]}}}