{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T18:12:26Z","timestamp":1763748746362,"version":"3.28.0"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,10,16]],"date-time":"2022-10-16T00:00:00Z","timestamp":1665878400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,10,16]],"date-time":"2022-10-16T00:00:00Z","timestamp":1665878400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,10,16]]},"DOI":"10.1109\/icip46576.2022.9897235","type":"proceedings-article","created":{"date-parts":[[2022,11,3]],"date-time":"2022-11-03T17:27:24Z","timestamp":1667496444000},"page":"1346-1350","source":"Crossref","is-referenced-by-count":7,"title":["Learning Contextually Fused Audio-Visual Representations For Audio-Visual Speech Recognition"],"prefix":"10.1109","author":[{"given":"Zi-Qiang","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China (USTC),Nel-Slip,Hefei,China"}]},{"given":"Jie","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China (USTC),Nel-Slip,Hefei,China"}]},{"given":"Jian-Shu","family":"Zhang","sequence":"additional","affiliation":[{"name":"iFlytek Co., Ltd.,iFlytek Research,Hefei,China"}]},{"given":"Ming-Hui","family":"Wu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China (USTC),Nel-Slip,Hefei,China"}]},{"given":"Xin","family":"Fang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China (USTC),Nel-Slip,Hefei,China"}]},{"given":"Li-Rong","family":"Dai","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China (USTC),Nel-Slip,Hefei,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1038\/264746a0"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461326"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2889052"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3243014"},{"key":"ref5","first-page":"638","article-title":"Attentive fusion enhanced audio-visual encoding for transformer based robust speech recognition","author":"Wei","year":"2020","journal-title":"APSIPA ASC"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414567"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2021.08.002"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682524"},{"key":"ref10","first-page":"7774","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","author":"Korbar","year":"2018","journal-title":"NIPS"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref12","first-page":"9758","article-title":"Self-supervised learning by cross-modal audio-video clustering","volume":"33","author":"Alwassel","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Contrastive learning of global and local audio-visual representations","year":"2021","author":"Ma","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.215"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01229"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01274"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_48"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1360"},{"article-title":"Learning speech representations from raw audio by joint audiovisual self-supervision","year":"2020","author":"Shukla","key":"ref19"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413488"},{"article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"Devlin","key":"ref21"},{"key":"ref22","first-page":"12 449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413869"},{"article-title":"Parameter efficient multimodal transformers for video representation learning","year":"2020","author":"Lee","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053841"},{"article-title":"Learning audio-visual speech representation by masked multimodal cluster prediction","year":"2022","author":"Shi","key":"ref27"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54184-6_6"},{"article-title":"Lrs3-ted: a large-scale dataset for visual speech recognition","year":"2018","author":"Afouras","key":"ref29"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref32","first-page":"6000","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"NIPS"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-4009"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4899-7687-1_79"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9415063"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00290"}],"event":{"name":"2022 IEEE International Conference on Image Processing (ICIP)","start":{"date-parts":[[2022,10,16]]},"location":"Bordeaux, France","end":{"date-parts":[[2022,10,19]]}},"container-title":["2022 IEEE International Conference on Image Processing (ICIP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9897158\/9897159\/09897235.pdf?arnumber=9897235","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,22]],"date-time":"2024-01-22T16:08:14Z","timestamp":1705939694000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9897235\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,16]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/icip46576.2022.9897235","relation":{},"subject":[],"published":{"date-parts":[[2022,10,16]]}}}