{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,14]],"date-time":"2025-10-14T23:01:01Z","timestamp":1760482861566,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3548080","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:42:46Z","timestamp":1665416566000},"page":"6511-6520","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Extreme-scale Talking-Face Video Upsampling with Audio-Visual Priors"],"prefix":"10.1145","author":[{"given":"Sindhu B.","family":"Hegde","sequence":"first","affiliation":[{"name":"International Institute of Information Technology Hyderabad, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rudrabha","family":"Mukhopadhyay","sequence":"additional","affiliation":[{"name":"International Institute of Information Technology Hyderabad, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vinay P.","family":"Namboodiri","sequence":"additional","affiliation":[{"name":"University of Bath, Bath, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"C.V.","family":"Jawahar","sequence":"additional","affiliation":[{"name":"International Institute of Information Technology Hyderabad, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"The Conversation: Deep Audio-Visual Speech Enhancement. In INTERSPEECH.","author":"Afouras T.","year":"2018","unstructured":"T. Afouras , J. S. Chung , and A. Zisserman . 2018 . The Conversation: Deep Audio-Visual Speech Enhancement. In INTERSPEECH. T. Afouras, J. S. Chung, and A. Zisserman. 2018. The Conversation: Deep Audio-Visual Speech Enhancement. In INTERSPEECH."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_2_1","DOI":"10.1007\/978-3-030-58523-5_13"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_3_1","DOI":"10.1109\/CVPR42600.2020.00853"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_4_1","DOI":"10.1109\/ICCV.2017.116"},{"key":"e_1_3_2_2_5_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV). 185--200","author":"Bulat Adrian","year":"2018","unstructured":"Adrian Bulat , Jing Yang , and Georgios Tzimiropoulos . 2018 . To learn image superresolution, use a GAN to learn how to do image degradation first . In Proceedings of the European Conference on Computer Vision (ECCV). 185--200 . Adrian Bulat, Jing Yang, and Georgios Tzimiropoulos. 2018. To learn image superresolution, use a GAN to learn how to do image degradation first. In Proceedings of the European Conference on Computer Vision (ECCV). 185--200."},{"key":"e_1_3_2_2_6_1","article-title":"Learning Spatial Attention for Face Super-Resolution","author":"Chen Chaofeng","year":"2020","unstructured":"Chaofeng Chen , Dihong Gong , Hao Wang , Zhifeng Li , and Kwan-Yee K. Wong . 2020 . Learning Spatial Attention for Face Super-Resolution . IEEE Transactions on Image Processing (TIP). Chaofeng Chen, Dihong Gong, Hao Wang, Zhifeng Li, and Kwan-Yee K. Wong. 2020. Learning Spatial Attention for Face Super-Resolution. IEEE Transactions on Image Processing (TIP).","journal-title":"IEEE Transactions on Image Processing (TIP)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_7_1","DOI":"10.1145\/3386569.3392457"},{"doi-asserted-by":"crossref","unstructured":"J. S. Chung A. Nagrani and A. Zisserman. 2018. VoxCeleb2: Deep Speaker Recognition. In INTERSPEECH.  J. S. Chung A. Nagrani and A. Zisserman. 2018. VoxCeleb2: Deep Speaker Recognition. In INTERSPEECH.","key":"e_1_3_2_2_8_1","DOI":"10.21437\/Interspeech.2018-1929"},{"volume-title":"Workshop on Multi-view Lip-reading, ACCV.","author":"Chung J. S.","unstructured":"J. S. Chung and A. Zisserman . 2016. Out of time: automated lip sync in the wild . In Workshop on Multi-view Lip-reading, ACCV. J. S. Chung and A. Zisserman. 2016. Out of time: automated lip sync in the wild. In Workshop on Multi-view Lip-reading, ACCV.","key":"e_1_3_2_2_9_1"},{"key":"e_1_3_2_2_10_1","volume-title":"Perfect Match: Improved Crossmodal Embeddings for Audio-visual Synchronisation. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 3965--3969","author":"Chung S.","year":"2019","unstructured":"S. Chung , J. S. Chung , and H. Kang . 2019 . Perfect Match: Improved Crossmodal Embeddings for Audio-visual Synchronisation. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 3965--3969 . https:\/\/doi.org\/10.1109\/ICASSP. 2019 .8682524 S. Chung, J. S. Chung, and H. Kang. 2019. Perfect Match: Improved Crossmodal Embeddings for Audio-visual Synchronisation. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 3965--3969. https:\/\/doi.org\/10.1109\/ICASSP.2019.8682524"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_11_1","DOI":"10.1145\/3388770.3407417"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_12_1","DOI":"10.1109\/WACV51458.2022.00127"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_13_1","DOI":"10.1109\/TPAMI.2015.2439281"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_14_1","DOI":"10.1109\/ICASSP.2018.8462263"},{"key":"e_1_3_2_2_15_1","volume-title":"Proceedings Ninth IEEE International Conference on Computer Vision. 726--733","volume":"2","year":"2003","unstructured":"Efros, Berg, Mori, and Malik. 2003 . Recognizing action at a distance . In Proceedings Ninth IEEE International Conference on Computer Vision. 726--733 vol. 2 . https:\/\/doi.org\/10.1109\/ICCV.2003.1238420 Efros, Berg, Mori, and Malik. 2003. Recognizing action at a distance. In Proceedings Ninth IEEE International Conference on Computer Vision. 726--733 vol.2. https:\/\/doi.org\/10.1109\/ICCV.2003.1238420"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_16_1","DOI":"10.1145\/3197517.3201357"},{"key":"e_1_3_2_2_17_1","article-title":"Text-Based Editing of Talking-Head Video","volume":"68","author":"Fried Ohad","year":"2019","unstructured":"Ohad Fried , Ayush Tewari , Michael Zollh\u00f6fer , Adam Finkelstein , Eli Shechtman , Dan B Goldman , Kyle Genova , Zeyu Jin , Christian Theobalt , and Maneesh Agrawala . 2019 . Text-Based Editing of Talking-Head Video . ACM Trans. Graph. 38, Article 68 (2019), 14 pages. https:\/\/doi.org\/10.1145\/3306346.3323028 Ohad Fried, Ayush Tewari, Michael Zollh\u00f6fer, Adam Finkelstein, Eli Shechtman, Dan B Goldman, Kyle Genova, Zeyu Jin, Christian Theobalt, and Maneesh Agrawala. 2019. Text-Based Editing of Talking-Head Video. ACM Trans. Graph. 38, Article 68 (2019), 14 pages. https:\/\/doi.org\/10.1145\/3306346.3323028","journal-title":"ACM Trans. Graph. 38, Article"},{"key":"e_1_3_2_2_18_1","volume-title":"Deep Back-Projection Networks for Super-Resolution. In 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1664--1673","author":"Haris Muhammad","year":"2018","unstructured":"Muhammad Haris , Greg Shakhnarovich , and Norimichi Ukita . 2018 . Deep Back-Projection Networks for Super-Resolution. In 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1664--1673 . https:\/\/doi.org\/10.1109\/CVPR.2018.00179 Muhammad Haris, Greg Shakhnarovich, and Norimichi Ukita. 2018. Deep Back-Projection Networks for Super-Resolution. In 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1664--1673. https:\/\/doi.org\/10.1109\/CVPR.2018.00179"},{"key":"e_1_3_2_2_19_1","volume-title":"Recurrent Back-Projection Network for Video Super-Resolution. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Haris Muhammad","year":"2019","unstructured":"Muhammad Haris , Greg Shakhnarovich , and Norimichi Ukita . 2019 . Recurrent Back-Projection Network for Video Super-Resolution. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). Muhammad Haris, Greg Shakhnarovich, and Norimichi Ukita. 2019. Recurrent Back-Projection Network for Video Super-Resolution. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_2_20_1","volume-title":"Estimation of Super-Resolved Video Dynamics. CoRR abs\/1506.00473","author":"H\u00e9as Patrick","year":"2015","unstructured":"Patrick H\u00e9as , Ang\u00e9lique Dr\u00e9meau , and C\u00e9dric Herzet . 2015. Estimation of Super-Resolved Video Dynamics. CoRR abs\/1506.00473 ( 2015 ). arXiv:1506.00473 Patrick H\u00e9as, Ang\u00e9lique Dr\u00e9meau, and C\u00e9dric Herzet. 2015. Estimation of Super-Resolved Video Dynamics. CoRR abs\/1506.00473 (2015). arXiv:1506.00473"},{"volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV). 1926--1935","author":"Hegde Sindhu B.","unstructured":"Sindhu B. Hegde , K.R. Prajwal , Rudrabha Mukhopadhyay , Vinay P. Namboodiri , and C.V. Jawahar . 2021. Visual Speech Enhancement Without a Real Visual Stream . In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV). 1926--1935 . Sindhu B. Hegde, K.R. Prajwal, Rudrabha Mukhopadhyay, Vinay P. Namboodiri, and C.V. Jawahar. 2021. Visual Speech Enhancement Without a Real Visual Stream. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV). 1926--1935.","key":"e_1_3_2_2_21_1"},{"key":"e_1_3_2_2_22_1","volume-title":"6D Rotation Representation For Unconstrained Head Pose Estimation. arXiv preprint arXiv:2202.12555","author":"Hempel Thorsten","year":"2022","unstructured":"Thorsten Hempel , Ahmed A Abdelrahman , and Ayoub Al-Hamadi . 2022. 6D Rotation Representation For Unconstrained Head Pose Estimation. arXiv preprint arXiv:2202.12555 ( 2022 ). Thorsten Hempel, Ahmed A Abdelrahman, and Ayoub Al-Hamadi. 2022. 6D Rotation Representation For Unconstrained Head Pose Estimation. arXiv preprint arXiv:2202.12555 (2022)."},{"unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2018. GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. arXiv:1706.08500 [cs.LG]  Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2018. GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. arXiv:1706.08500 [cs.LG]","key":"e_1_3_2_2_23_1"},{"unstructured":"Jonathan Ho Evan Lohn and P. Abbeel. 2019. Compression with Flows via Local Bits-Back Coding. In NeurIPS.  Jonathan Ho Evan Lohn and P. Abbeel. 2019. Compression with Flows via Local Bits-Back Coding. In NeurIPS.","key":"e_1_3_2_2_24_1"},{"key":"e_1_3_2_2_25_1","volume-title":"Finding Tiny Faces. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Hu Peiyun","year":"2017","unstructured":"Peiyun Hu and Deva Ramanan . 2017 . Finding Tiny Faces. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR). Peiyun Hu and Deva Ramanan. 2017. Finding Tiny Faces. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_2_26_1","volume-title":"Joon Son Chung, and Andrew Zisserman","author":"Jamaludin Amir","year":"2019","unstructured":"Amir Jamaludin , Joon Son Chung, and Andrew Zisserman . 2019 . You said that?: Synthesising talking faces from audio. International Journal of Computer Vision ( 2019). Amir Jamaludin, Joon Son Chung, and Andrew Zisserman. 2019. You said that?: Synthesising talking faces from audio. International Journal of Computer Vision (2019)."},{"key":"e_1_3_2_2_27_1","volume-title":"Super SloMo: High Quality Estimation of Multiple Intermediate Frames for Video Interpolation. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Jiang Huaizu","year":"2018","unstructured":"Huaizu Jiang , Deqing Sun , V. Jampani , Ming-Hsuan Yang , Erik G. Learned-Miller , and Jan Kautz . 2018 . Super SloMo: High Quality Estimation of Multiple Intermediate Frames for Video Interpolation. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018), 9000--9008. Huaizu Jiang, Deqing Sun, V. Jampani, Ming-Hsuan Yang, Erik G. Learned-Miller, and Jan Kautz. 2018. Super SloMo: High Quality Estimation of Multiple Intermediate Frames for Video Interpolation. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018), 9000--9008."},{"key":"e_1_3_2_2_28_1","volume-title":"Proceedings of the 27th ACM International Conference on Multimedia","author":"Rudrabha Mukhopadhyay Prajwal K R","year":"2019","unstructured":"Prajwal K R , Rudrabha Mukhopadhyay , Jerin Philip , Abhishek Jha , Vinay Namboodiri , and C V Jawahar . 2019 . Towards Automatic Face-to-Face Translation . In Proceedings of the 27th ACM International Conference on Multimedia ( Nice, France) (MM '19). ACM, 9 pages. https:\/\/doi.org\/10.1145\/3343031.3351066 Prajwal K R, Rudrabha Mukhopadhyay, Jerin Philip, Abhishek Jha, Vinay Namboodiri, and C V Jawahar. 2019. Towards Automatic Face-to-Face Translation. In Proceedings of the 27th ACM International Conference on Multimedia (Nice, France) (MM '19). ACM, 9 pages. https:\/\/doi.org\/10.1145\/3343031.3351066"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_29_1","DOI":"10.1109\/ICCV48922.2021.00448"},{"key":"e_1_3_2_2_30_1","volume-title":"Proceedings of Asian Conference on Computer Vision (ACCV).","author":"Kim Changil","year":"2018","unstructured":"Changil Kim , Hijung Valentina Shin , Tae-Hyun Oh , Alexandre Kaspar , Mohamed Elgharib , and Wojciech Matusik . 2018 . On Learning Associations of Faces and Voices . In Proceedings of Asian Conference on Computer Vision (ACCV). Changil Kim, Hijung Valentina Shin, Tae-Hyun Oh, Alexandre Kaspar, Mohamed Elgharib, and Wojciech Matusik. 2018. On Learning Associations of Faces and Voices. In Proceedings of Asian Conference on Computer Vision (ACCV)."},{"key":"e_1_3_2_2_31_1","volume-title":"Proceedings of the 30th British Machine Vision Conference (BMVC).","author":"Kim Deokyun","year":"2019","unstructured":"Deokyun Kim , Minseon Kim , Gihyun Kwon , and Dae-Shik Kim . 2019 . Progressive Face Super-Resolution via Attention to Facial Landmark . In Proceedings of the 30th British Machine Vision Conference (BMVC). Deokyun Kim, Minseon Kim, Gihyun Kwon, and Dae-Shik Kim. 2019. Progressive Face Super-Resolution via Attention to Facial Landmark. In Proceedings of the 30th British Machine Vision Conference (BMVC)."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3197517.3201283","article-title":"Deep video portraits","volume":"37","author":"Kim H.","year":"2018","unstructured":"H. Kim , Pablo Garrido , Ayush Tewari , Weipeng Xu , Justus Thies , Matthias Nie\u00dfner , P. P\u00e9rez , Christian Richardt , M. Zollh\u00f6fer , and C. Theobalt . 2018 . Deep video portraits . ACM Transactions on Graphics (TOG) 37 (2018), 1 -- 14 . H. Kim, Pablo Garrido, Ayush Tewari, Weipeng Xu, Justus Thies, Matthias Nie\u00dfner, P. P\u00e9rez, Christian Richardt, M. Zollh\u00f6fer, and C. Theobalt. 2018. Deep video portraits. ACM Transactions on Graphics (TOG) 37 (2018), 1 -- 14.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"e_1_3_2_2_33_1","volume-title":"Jung Kwon Lee, and Kyoung Mu Lee","author":"Kim Jiwon","year":"2015","unstructured":"Jiwon Kim , Jung Kwon Lee, and Kyoung Mu Lee . 2015 . Accurate Image Super- Resolution Using Very Deep Convolutional Networks. CoRR abs\/1511.04587 (2015). arXiv:1511.04587 http:\/\/arxiv.org\/abs\/1511.04587 Jiwon Kim, Jung Kwon Lee, and Kyoung Mu Lee. 2015. Accurate Image Super- Resolution Using Very Deep Convolutional Networks. CoRR abs\/1511.04587 (2015). arXiv:1511.04587 http:\/\/arxiv.org\/abs\/1511.04587"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_34_1","DOI":"10.1016\/j.neucom.2021.03.048"},{"key":"e_1_3_2_2_35_1","volume-title":"Enhanced Deep Residual Networks for Single Image Super-Resolution. In 2017 IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW). 1132--1140","author":"Lim Bee","year":"2017","unstructured":"Bee Lim , Sanghyun Son , Heewon Kim , Seungjun Nah , and Kyoung Mu Lee . 2017 . Enhanced Deep Residual Networks for Single Image Super-Resolution. In 2017 IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW). 1132--1140 . https:\/\/doi.org\/10.1109\/CVPRW.2017.151 Bee Lim, Sanghyun Son, Heewon Kim, Seungjun Nah, and Kyoung Mu Lee. 2017. Enhanced Deep Residual Networks for Single Image Super-Resolution. In 2017 IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW). 1132--1140. https:\/\/doi.org\/10.1109\/CVPRW.2017.151"},{"key":"e_1_3_2_2_36_1","volume-title":"Deep Face Super-Resolution With Iterative Collaboration Between Attentive Recovery and Landmark Estimation. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Ma Cheng","year":"2020","unstructured":"Cheng Ma , Zhenyu Jiang , Yongming Rao , Jiwen Lu , and J. Zhou . 2020 . Deep Face Super-Resolution With Iterative Collaboration Between Attentive Recovery and Landmark Estimation. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) ( 2020 ), 5568--5577. Cheng Ma, Zhenyu Jiang, Yongming Rao, Jiwen Lu, and J. Zhou. 2020. Deep Face Super-Resolution With Iterative Collaboration Between Attentive Recovery and Landmark Estimation. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020), 5568--5577."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_37_1","DOI":"10.1109\/CVPR42600.2020.00251"},{"key":"e_1_3_2_2_38_1","volume-title":"Seeing Voices and Hearing Faces: Cross-Modal Biometric Matching. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Nagrani Arsha","year":"2018","unstructured":"Arsha Nagrani , Samuel Albanie , and Andrew Zisserman . 2018 . Seeing Voices and Hearing Faces: Cross-Modal Biometric Matching. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2018), 8427--8436. Arsha Nagrani, Samuel Albanie, and Andrew Zisserman. 2018. Seeing Voices and Hearing Faces: Cross-Modal Biometric Matching. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2018), 8427--8436."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_39_1","DOI":"10.1109\/ATNAC.2013.6705380"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_40_1","DOI":"10.1109\/CVPR.2019.00772"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_41_1","DOI":"10.1109\/CVPRW53098.2021.00271"},{"volume-title":"Proceedings of the 28th ACM International Conference on Multimedia (MM'20)","author":"Prajwal K R","unstructured":"K R Prajwal , Rudrabha Mukhopadhyay , Vinay P. Namboodiri , and C.V. Jawahar . 2020. A Lip Sync Expert Is All You Need for Speech to Lip Generation In the Wild . In Proceedings of the 28th ACM International Conference on Multimedia (MM'20) . 484--492. https:\/\/doi.org\/10.1145\/3394171.3413532 K R Prajwal, Rudrabha Mukhopadhyay, Vinay P. Namboodiri, and C.V. Jawahar. 2020. A Lip Sync Expert Is All You Need for Speech to Lip Generation In the Wild. In Proceedings of the 28th ACM International Conference on Multimedia (MM'20). 484--492. https:\/\/doi.org\/10.1145\/3394171.3413532","key":"e_1_3_2_2_42_1"},{"key":"e_1_3_2_2_43_1","volume-title":"Frame-Recurrent Video Super-Resolution. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Sajjadi Mehdi S. M.","year":"2018","unstructured":"Mehdi S. M. Sajjadi , Raviteja Vemulapalli , and Matthew Brown . 2018 . Frame-Recurrent Video Super-Resolution. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR). Mehdi S. M. Sajjadi, Raviteja Vemulapalli, and Matthew Brown. 2018. Frame-Recurrent Video Super-Resolution. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_2_44_1","volume-title":"First Order Motion Model for Image Animation. In Conference on Neural Information Processing Systems (NeurIPS).","author":"Siarohin Aliaksandr","year":"2019","unstructured":"Aliaksandr Siarohin , St\u00e9phane Lathuili\u00e8re , Sergey Tulyakov , Elisa Ricci , and Nicu Sebe . 2019 . First Order Motion Model for Image Animation. In Conference on Neural Information Processing Systems (NeurIPS). Aliaksandr Siarohin, St\u00e9phane Lathuili\u00e8re, Sergey Tulyakov, Elisa Ricci, and Nicu Sebe. 2019. First Order Motion Model for Image Animation. In Conference on Neural Information Processing Systems (NeurIPS)."},{"volume-title":"Joint Implicit Image Function for Guided Depth Super-Resolution","author":"Tang Jiaxiang","unstructured":"Jiaxiang Tang , Xiaokang Chen , and Gang Zeng . 2021. Joint Implicit Image Function for Guided Depth Super-Resolution . Association for Computing Machinery , New York, NY, USA , 4390--4399. https:\/\/doi.org\/10.1145\/3474085.3475584 Jiaxiang Tang, Xiaokang Chen, and Gang Zeng. 2021. Joint Implicit Image Function for Guided Depth Super-Resolution. Association for Computing Machinery, New York, NY, USA, 4390--4399. https:\/\/doi.org\/10.1145\/3474085.3475584","key":"e_1_3_2_2_45_1"},{"key":"e_1_3_2_2_46_1","first-page":"1","article-title":"Face2Face","volume":"62","author":"Thies Justus","year":"2018","unstructured":"Justus Thies , Michael Zollh\u00f6fer , Marc Stamminger , Christian Theobalt , and Matthias Nie\u00dfner . 2018 . Face2Face : Real-Time Face Capture and Reenactment of RGB Videos. Commun. ACM 62 , 1 (Dec. 2018), 96--104. https:\/\/doi.org\/10.1145\/3292039 Justus Thies, Michael Zollh\u00f6fer, Marc Stamminger, Christian Theobalt, and Matthias Nie\u00dfner. 2018. Face2Face: Real-Time Face Capture and Reenactment of RGB Videos. Commun. ACM 62, 1 (Dec. 2018), 96--104. https:\/\/doi.org\/10.1145\/3292039","journal-title":"Real-Time Face Capture and Reenactment of RGB Videos. Commun. ACM"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_47_1","DOI":"10.1109\/CVPRW.2018.00130"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_48_1","DOI":"10.1109\/TPAMI.2007.1055"},{"unstructured":"Ting-Chun Wang Ming-Yu Liu Andrew Tao Guilin Liu Jan Kautz and Bryan Catanzaro. 2019. Few-shot Video-to-Video Synthesis. In Advances in Neural Information Processing Systems (NeurIPS).  Ting-Chun Wang Ming-Yu Liu Andrew Tao Guilin Liu Jan Kautz and Bryan Catanzaro. 2019. Few-shot Video-to-Video Synthesis. In Advances in Neural Information Processing Systems (NeurIPS).","key":"e_1_3_2_2_49_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_50_1","DOI":"10.1109\/CVPR46437.2021.00991"},{"volume-title":"Imitating Arbitrary Talking Style for Realistic Audio-Driven Talking Face Synthesis","author":"Wu Haozhe","unstructured":"Haozhe Wu , Jia Jia , Haoyu Wang , Yishun Dou , Chao Duan , and Qingshan Deng . 2021. Imitating Arbitrary Talking Style for Realistic Audio-Driven Talking Face Synthesis . Association for Computing Machinery , New York, NY, USA , 1478--1486. https:\/\/doi.org\/10.1145\/3474085.3475280 Haozhe Wu, Jia Jia, Haoyu Wang, Yishun Dou, Chao Duan, and Qingshan Deng. 2021. Imitating Arbitrary Talking Style for Realistic Audio-Driven Talking Face Synthesis. Association for Computing Machinery, New York, NY, USA, 1478--1486. https:\/\/doi.org\/10.1145\/3474085.3475280","key":"e_1_3_2_2_51_1"},{"key":"e_1_3_2_2_52_1","volume-title":"Advances in Neural Information Processing Systems","volume":"27","author":"Xu Li","year":"2014","unstructured":"Li Xu , Jimmy SJ Ren , Ce Liu , and Jiaya Jia . 2014 . Deep Convolutional Neural Network for Image Deconvolution . In Advances in Neural Information Processing Systems , Vol. 27 . Curran Associates, Inc. Li Xu, Jimmy SJ Ren, Ce Liu, and Jiaya Jia. 2014. Deep Convolutional Neural Network for Image Deconvolution. In Advances in Neural Information Processing Systems, Vol. 27. Curran Associates, Inc."},{"volume-title":"Super Resolution Using Dual Path Connections (MM '19)","author":"Yang Bin-Cheng","unstructured":"Bin-Cheng Yang . 2019. Super Resolution Using Dual Path Connections (MM '19) . Association for Computing Machinery , New York, NY, USA , 1552--1560. https:\/\/doi.org\/10.1145\/3343031.3350878 Bin-Cheng Yang. 2019. Super Resolution Using Dual Path Connections (MM '19). Association for Computing Machinery, New York, NY, USA, 1552--1560. https:\/\/doi.org\/10.1145\/3343031.3350878","key":"e_1_3_2_2_53_1"},{"key":"e_1_3_2_2_54_1","volume-title":"Iterative Text-based Editing of Talking-heads Using Neural Retargeting. ArXiv abs\/2011.10688","author":"Yao Xin-Wei","year":"2020","unstructured":"Xin-Wei Yao , Ohad Fried , K. Fatahalian , and Maneesh Agrawala . 2020. Iterative Text-based Editing of Talking-heads Using Neural Retargeting. ArXiv abs\/2011.10688 ( 2020 ). Xin-Wei Yao, Ohad Fried, K. Fatahalian, and Maneesh Agrawala. 2020. Iterative Text-based Editing of Talking-heads Using Neural Retargeting. ArXiv abs\/2011.10688 (2020)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_55_1","DOI":"10.1109\/ICCV48922.2021.00384"},{"key":"e_1_3_2_2_56_1","volume-title":"2017 IEEE International Conference on Computer Vision (ICCV)","author":"Zhang Shifeng","year":"2017","unstructured":"Shifeng Zhang , Xiangyu Zhu , Zhen Lei , Hailin Shi , Xiaobo Wang , and S. Li . 2017. S3FD: Single Shot Scale-Invariant Face Detector . 2017 IEEE International Conference on Computer Vision (ICCV) ( 2017 ), 192--201. Shifeng Zhang, Xiangyu Zhu, Zhen Lei, Hailin Shi, Xiaobo Wang, and S. Li. 2017. S3FD: Single Shot Scale-Invariant Face Detector. 2017 IEEE International Conference on Computer Vision (ICCV) (2017), 192--201."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_57_1","DOI":"10.1109\/CVPR46437.2021.00416"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_58_1","DOI":"10.1145\/3414685.3417774"}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"MM '22","name":"MM '22: The 30th ACM International Conference on Multimedia","location":"Lisboa Portugal"},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548080","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3548080","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:18Z","timestamp":1750186818000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548080"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":58,"alternative-id":["10.1145\/3503161.3548080","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3548080","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}