{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T17:26:58Z","timestamp":1770917218286,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475196","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T10:23:20Z","timestamp":1634552600000},"page":"478-486","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":18,"title":["TACR-Net: Editing on Deep Video and Voice Portraits"],"prefix":"10.1145","author":[{"given":"Luchuan","family":"Song","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bin","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guojun","family":"Yin","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoyi","family":"Dong","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yufei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Meituan, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jia-Xuan","family":"Bai","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473","author":"Bahdanau Dzmitry","year":"2014","unstructured":"Dzmitry Bahdanau , Kyunghyun Cho , and Yoshua Bengio . Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 , 2014 . Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473, 2014."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/311535.311556"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58545-7_3"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00802"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"e_1_3_2_2_6_1","volume-title":"Workshop on Multi-view Lip-reading, ACCV","author":"Chung J. S.","year":"2016","unstructured":"J. S. Chung and A. Zisserman . Out of time: automated lip sync in the wild . In Workshop on Multi-view Lip-reading, ACCV , 2016 . J. S. Chung and A. Zisserman. Out of time: automated lip sync in the wild. In Workshop on Multi-view Lip-reading, ACCV, 2016."},{"key":"e_1_3_2_2_7_1","first-page":"87","volume-title":"Asian Conference on Computer Vision","author":"Chung Joon Son","year":"2016","unstructured":"Joon Son Chung and Andrew Zisserman . Lip reading in the wild . In Asian Conference on Computer Vision , pages 87 -- 103 . Springer , 2016 . Joon Son Chung and Andrew Zisserman. Lip reading in the wild. In Asian Conference on Computer Vision, pages 87--103. Springer, 2016."},{"key":"e_1_3_2_2_8_1","first-page":"251","volume-title":"Asian conference on computer vision","author":"Chung Joon Son","year":"2016","unstructured":"Joon Son Chung and Andrew Zisserman . Out of time: automated lip sync in the wild . In Asian conference on computer vision , pages 251 -- 263 . Springer , 2016 . Joon Son Chung and Andrew Zisserman. Out of time: automated lip sync in the wild. In Asian conference on computer vision, pages 251--263. Springer, 2016."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3323028"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2638549"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.12552"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_2_15_1","volume-title":"Cnn-based real-time dense face reconstruction with inverse-rendered photo-realistic face images","author":"Guo Yudong","year":"2018","unstructured":"Yudong Guo , Jianfei Cai , Boyi Jiang , Jianmin Zheng , Cnn-based real-time dense face reconstruction with inverse-rendered photo-realistic face images . IEEE transactions on pattern analysis and machine intelligence, 41(6):1294--1307, 2018 . Yudong Guo, Jianfei Cai, Boyi Jiang, Jianmin Zheng, et al. Cnn-based real-time dense face reconstruction with inverse-rendered photo-realistic face images. IEEE transactions on pattern analysis and machine intelligence, 41(6):1294--1307, 2018."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.5555\/646247.685022"},{"key":"e_1_3_2_2_17_1","volume-title":"Deep speech: Scaling up end-to-end speech recognition","author":"Hannun Awni","year":"2014","unstructured":"Awni Hannun , Carl Case , Jared Casper , Bryan Catanzaro , Greg Diamos , Erich Elsen , Ryan Prenger , Sanjeev Satheesh , Shubho Sengupta , Adam Coates , and Andrew Y. Ng . Deep speech: Scaling up end-to-end speech recognition , 2014 . Awni Hannun, Carl Case, Jared Casper, Bryan Catanzaro, Greg Diamos, Erich Elsen, Ryan Prenger, Sanjeev Satheesh, Shubho Sengupta, Adam Coates, and Andrew Y. Ng. Deep speech: Scaling up end-to-end speech recognition, 2014."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.179"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.632"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3355089.3356500"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201283"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"e_1_3_2_2_23_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba . Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 , 2014 . Diederik P Kingma and Jimmy Ba. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980, 2014."},{"key":"e_1_3_2_2_24_1","volume-title":"Obamanet: Photo-realistic lip-sync from text","author":"Kumar Rithesh","year":"2017","unstructured":"Rithesh Kumar , Jose Sotelo , Kundan Kumar , Alexandre de Brebisson , and Yoshua Bengio . Obamanet: Photo-realistic lip-sync from text , 2017 . Rithesh Kumar, Jose Sotelo, Kundan Kumar, Alexandre de Brebisson, and Yoshua Bengio. Obamanet: Photo-realistic lip-sync from text, 2017."},{"key":"e_1_3_2_2_25_1","first-page":"2","article-title":"Toward a practical perceptual video quality metric","volume":"6","author":"Li Zhi","year":"2016","unstructured":"Zhi Li , Anne Aaron , Ioannis Katsavounidis , Anush Moorthy , and Megha Manohara . Toward a practical perceptual video quality metric . The Netflix Tech Blog , 6 : 2 , 2016 . Zhi Li, Anne Aaron, Ioannis Katsavounidis, Anush Moorthy, and Megha Manohara. Toward a practical perceptual video quality metric. The Netflix Tech Blog, 6:2, 2016.","journal-title":"The Netflix Tech Blog"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"e_1_3_2_2_27_1","volume-title":"World-consistent video-to-video synthesis. arXiv preprint arXiv:2007.08509","author":"Mallya Arun","year":"2020","unstructured":"Arun Mallya , Ting-Chun Wang , Karan Sapra , and Ming-Yu Liu . World-consistent video-to-video synthesis. arXiv preprint arXiv:2007.08509 , 2020 . Arun Mallya, Ting-Chun Wang, Karan Sapra, and Ming-Yu Liu. World-consistent video-to-video synthesis. arXiv preprint arXiv:2007.08509, 2020."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404790"},{"key":"e_1_3_2_2_29_1","volume-title":"Conditional generative adversarial nets. arXiv preprint arXiv:1411.1784","author":"Mirza Mehdi","year":"2014","unstructured":"Mehdi Mirza and Simon Osindero . Conditional generative adversarial nets. arXiv preprint arXiv:1411.1784 , 2014 . Mehdi Mirza and Simon Osindero. Conditional generative adversarial nets. arXiv preprint arXiv:1411.1784, 2014."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-950"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1207\/s15327051hci0804_4"},{"key":"e_1_3_2_2_32_1","volume-title":"Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499","author":"van den Oord Aaron","year":"2016","unstructured":"Aaron van den Oord , Sander Dieleman , Heiga Zen , Karen Simonyan , Oriol Vinyals , Alex Graves , Nal Kalchbrenner , Andrew Senior , and Koray Kavukcuoglu . Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499 , 2016 . Aaron van den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalchbrenner, Andrew Senior, and Koray Kavukcuoglu. Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499, 2016."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-647"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_2_35_1","unstructured":"Kaizhi\n      Qian Yang\n      Zhang Shiyu\n      Chang Xuesong\n      Yang and \n      Mark\n      Hasegawa-Johnson\n    .\n  AutoVC: Zero-shot voice style transfer with only autoencoder loss\n  . volume \n  97\n   of \n  Proceedings of Machine Learning Research pages \n  5210\n  --\n  5219 Long Beach California USA 09--15 Jun \n  2019\n  . \n  PMLR.  Kaizhi Qian Yang Zhang Shiyu Chang Xuesong Yang and Mark Hasegawa-Johnson. AutoVC: Zero-shot voice style transfer with only autoencoder loss. volume 97 of Proceedings of Machine Learning Research pages 5210--5219 Long Beach California USA 09--15 Jun 2019. PMLR."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_2_2_37_1","first-page":"4197","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Roth Joseph","year":"2016","unstructured":"Joseph Roth , Yiying Tong , and Xiaoming Liu . Adaptive 3d face reconstruction from unconstrained photo collections . In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition , pages 4197 -- 4206 , 2016 . Joseph Roth, Yiying Tong, and Xiaoming Liu. Adaptive 3d face reconstruction from unconstrained photo collections. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pages 4197--4206, 2016."},{"key":"e_1_3_2_2_38_1","first-page":"604","volume-title":"2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)","author":"Sak Hacsim","year":"2015","unstructured":"Hacsim Sak , F\u00e9lix de Chaumont Quitry , Tara Sainath , Kanishka Rao , Acoustic modelling with cd-ctc-smbr lstm rnns . In 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU) , pages 604 -- 609 . IEEE, 2015 . Hacsim Sak, F\u00e9lix de Chaumont Quitry, Tara Sainath, Kanishka Rao, et al. Acoustic modelling with cd-ctc-smbr lstm rnns. In 2015 IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU), pages 604--609. IEEE, 2015."},{"key":"e_1_3_2_2_39_1","volume-title":"First order motion model for image animation. arXiv preprint arXiv:2003.00196","author":"Siarohin Aliaksandr","year":"2020","unstructured":"Aliaksandr Siarohin , St\u00e9phane Lathuili\u00e8re , Sergey Tulyakov , Elisa Ricci , and Nicu Sebe . First order motion model for image animation. arXiv preprint arXiv:2003.00196 , 2020 . Aliaksandr Siarohin, St\u00e9phane Lathuili\u00e8re, Sergey Tulyakov, Elisa Ricci, and Nicu Sebe. First order motion model for image animation. arXiv preprint arXiv:2003.00196, 2020."},{"key":"e_1_3_2_2_40_1","volume-title":"Everybody's talkin': Let me talk as you want. arXiv preprint arXiv:2001.05201","author":"Song Linsen","year":"2020","unstructured":"Linsen Song , Wayne Wu , Chen Qian , Ran He , and Chen Change Loy . Everybody's talkin': Let me talk as you want. arXiv preprint arXiv:2001.05201 , 2020 . Linsen Song, Wayne Wu, Chen Qian, Ran He, and Chen Change Loy. Everybody's talkin': Let me talk as you want. arXiv preprint arXiv:2001.05201, 2020."},{"key":"e_1_3_2_2_41_1","volume-title":"Talking face generation by conditional recurrent adversarial network. arXiv preprint arXiv:1804.04786","author":"Song Yang","year":"2018","unstructured":"Yang Song , Jingwen Zhu , Dawei Li , Xiaolong Wang , and Hairong Qi . Talking face generation by conditional recurrent adversarial network. arXiv preprint arXiv:1804.04786 , 2018 . Yang Song, Jingwen Zhu, Dawei Li, Xiaolong Wang, and Hairong Qi. Talking face generation by conditional recurrent adversarial network. arXiv preprint arXiv:1804.04786, 2018."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073640"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/2929464.2929475"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3323035"},{"key":"e_1_3_2_2_46_1","volume-title":"Attention is all you need. arXiv preprint arXiv:1706.03762","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan N Gomez , Lukasz Kaiser , and Illia Polosukhin . Attention is all you need. arXiv preprint arXiv:1706.03762 , 2017 . Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention is all you need. arXiv preprint arXiv:1706.03762, 2017."},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01251-8"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454738"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.5555\/3326943.3327049"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2982166"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"e_1_3_2_2_52_1","volume-title":"Audio-driven talking face video generation with learning-based personalized head pose. arXiv e-prints","author":"Yi Ran","year":"2002","unstructured":"Ran Yi , Zipeng Ye , Juyong Zhang , Hujun Bao , and Yong-Jin Liu . Audio-driven talking face video generation with learning-based personalized head pose. arXiv e-prints , pages arXiv-- 2002 , 2020. Ran Yi, Zipeng Ye, Juyong Zhang, Hujun Bao, and Yong-Jin Liu. Audio-driven talking face video generation with learning-based personalized head pose. arXiv e-prints, pages arXiv--2002, 2020."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCI.2016.2644865"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417774"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475196","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475196","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:47Z","timestamp":1750193327000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475196"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":54,"alternative-id":["10.1145\/3474085.3475196","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475196","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}