{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:19:54Z","timestamp":1775067594351,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Guangdong Provincial Key Field Research and Development Plan Project"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475420","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T04:52:26Z","timestamp":1634532746000},"page":"2492-2500","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":14,"title":["CALLip: Lipreading using Contrastive and Attribute Learning"],"prefix":"10.1145","author":[{"given":"Yiyang","family":"Huang","sequence":"first","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuefeng","family":"Liang","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chaowei","family":"Fang","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"International Workshop on Challenges in Hearing Assistive Technology (CHAT-2017)","author":"Adeel Ahsan","year":"2017"},{"key":"e_1_3_2_1_2_1","volume-title":"LipNet: Sentence-level Lipreading. preprint arXiv:1611.01599","author":"Assael Yannis M.","year":"2016"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3455679"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2017.07.001"},{"key":"e_1_3_2_1_5_1","volume-title":"Hinton","author":"Chen Ting","year":"2020"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413623"},{"key":"e_1_3_2_1_7_1","unstructured":"Kyunghyun Cho Bart van Merrienboer \u00c7aglar G\u00fcl\u00e7ehre Dzmitry Bahdanau Fethi Bougares Holger Schwenk and Yoshua Bengio. 2014. Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation. In EMNLP. 1724--1734. Kyunghyun Cho Bart van Merrienboer \u00c7aglar G\u00fcl\u00e7ehre Dzmitry Bahdanau Fethi Bougares Holger Schwenk and Yoshua Bengio. 2014. Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation. In EMNLP. 1724--1734."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Joon Son Chung Andrew W. Senior Oriol Vinyals and Andrew Zisserman. 2017. Lip Reading Sentences in the Wild. In CVPR. 3444--3453. Joon Son Chung Andrew W. Senior Oriol Vinyals and Andrew Zisserman. 2017. Lip Reading Sentences in the Wild. In CVPR. 3444--3453.","DOI":"10.1109\/CVPR.2017.367"},{"key":"e_1_3_2_1_9_1","volume-title":"Out of Time: Automated Lip Sync in the Wild. In ACCV Workshops (2). 251--263","author":"Chung Joon Son","year":"2016"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Ivan Fung and Brian Mak. 2018. End-To-End Low-Resource Lip-Reading with Maxout Cnn and Lstm. In ICASSP. 2511--2515. Ivan Fung and Brian Mak. 2018. End-To-End Low-Resource Lip-Reading with Maxout Cnn and Lstm. In ICASSP. 2511--2515.","DOI":"10.1109\/ICASSP.2018.8462280"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCST.2002.1049223"},{"key":"e_1_3_2_1_13_1","volume-title":"Bilal Piot, Koray Kavukcuoglu, R\u00e9mi Munos, and Michal Valko.","author":"Grill Jean-Bastien","year":"2020"},{"key":"e_1_3_2_1_14_1","volume-title":"Girshick","author":"He Kaiming","year":"2020"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-739X(03)00145-6"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.5555\/1577069.1755843"},{"key":"e_1_3_2_1_17_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2015"},{"key":"e_1_3_2_1_18_1","volume-title":"Stern","author":"Kumar Kshitiz","year":"2007"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Yunfan Liu Qi Li and Zhenan Sun. 2019. Attribute-Aware Face Aging With Wavelet-Based Generative Adversarial Networks. In CVPR. 11877--11886. Yunfan Liu Qi Li and Zhenan Sun. 2019. Attribute-Aware Face Aging With Wavelet-Based Generative Adversarial Networks. In CVPR. 11877--11886.","DOI":"10.1109\/CVPR.2019.01215"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Yunfan Liu Qi Li and Zhenan Sun. 2019. Attribute-Aware Face Aging With Wavelet-Based Generative Adversarial Networks. In CVPR. 11877--11886. Yunfan Liu Qi Li and Zhenan Sun. 2019. Attribute-Aware Face Aging With Wavelet-Based Generative Adversarial Networks. In CVPR. 11877--11886.","DOI":"10.1109\/CVPR.2019.01215"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"e_1_3_2_1_22_1","volume-title":"Hearing lips and seeing voices. Nature 264, 5588","author":"McGurk Harry","year":"1976"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3455008"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Jesus F. Guitarte Perez Alejandro F. Frangi Eduardo Lleida-Solano and Klaus Lukas. 2005. Lip Reading for Robust Speech Recognition on Embedded Devices. In ICASSP (1). 473--476. Jesus F. Guitarte Perez Alejandro F. Frangi Eduardo Lleida-Solano and Klaus Lukas. 2005. Lip Reading for Robust Speech Recognition on Embedded Devices. In ICASSP (1). 473--476.","DOI":"10.1109\/ICASSP.2005.1415153"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Stavros Petridis Themos Stafylakis Pingchuan Ma Georgios Tzimiropoulos and Maja Pantic. 2018. Audio-Visual Speech Recognition with a Hybrid CTC\/Attention Architecture. In SLT. 513--520. Stavros Petridis Themos Stafylakis Pingchuan Ma Georgios Tzimiropoulos and Maja Pantic. 2018. Audio-Visual Speech Recognition with a Hybrid CTC\/Attention Architecture. In SLT. 513--520.","DOI":"10.1109\/SLT.2018.8639643"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"K. R. Prajwal Rudrabha Mukhopadhyay Vinay P. Namboodiri and C. V. Jawahar. 2020. Learning Individual Speaking Styles for Accurate Lip to Speech Synthesis. In CVPR. 13793--13802. K. R. Prajwal Rudrabha Mukhopadhyay Vinay P. Namboodiri and C. V. Jawahar. 2020. Learning Individual Speaking Styles for Accurate Lip to Speech Synthesis. In CVPR. 13793--13802.","DOI":"10.1109\/CVPR42600.2020.01381"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Yuankai Qi Shengping Zhang Weigang Zhang Li Su Qingming Huang and Ming-Hsuan Yang. 2019. Learning Attribute-Specific Representations for VisualTracking. In AAAI. 8835--8842. Yuankai Qi Shengping Zhang Weigang Zhang Li Su Qingming Huang and Ming-Hsuan Yang. 2019. Learning Attribute-Specific Representations for VisualTracking. In AAAI. 8835--8842.","DOI":"10.1609\/aaai.v33i01.33018835"},{"key":"e_1_3_2_1_28_1","unstructured":"Sashank J. Reddi Satyen Kale and Sanjiv Kumar. 2018. On the Convergence of Adam and Beyond. In ICLR. Sashank J. Reddi Satyen Kale and Sanjiv Kumar. 2018. On the Convergence of Adam and Beyond. In ICLR."},{"key":"e_1_3_2_1_29_1","volume-title":"Smoothgrad: removing noise by adding noise. preprint arXiv:1706.03825","author":"Smilkov Daniel","year":"2017"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"David Snyder Pegah Ghahremani Daniel Povey Daniel Garcia-Romero Yishay Carmiel and Sanjeev Khudanpur. 2016. Deep neural network-based speaker embeddings for end-to-end speaker verification. In SLT. 165--170. David Snyder Pegah Ghahremani Daniel Povey Daniel Garcia-Romero Yishay Carmiel and Sanjeev Khudanpur. 2016. Deep neural network-based speaker embeddings for end-to-end speaker verification. In SLT. 165--170.","DOI":"10.1109\/SLT.2016.7846260"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.5555\/3157096.3157304"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Themos Stafylakis and Georgios Tzimiropoulos. 2017. Combining Residual Networks with LSTMs for Lipreading. In INTERSPEECH. 3652--3656. Themos Stafylakis and Georgios Tzimiropoulos. 2017. Combining Residual Networks with LSTMs for Lipreading. In INTERSPEECH. 3652--3656.","DOI":"10.21437\/Interspeech.2017-85"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Yonglong Tian Dilip Krishnan and Phillip Isola. 2020. Contrastive Multiview Coding. In ECCV (11). 776--794. Yonglong Tian Dilip Krishnan and Phillip Isola. 2020. Contrastive Multiview Coding. In ECCV (11). 776--794.","DOI":"10.1007\/978-3-030-58621-8_45"},{"key":"e_1_3_2_1_34_1","volume-title":"Representation Learning with Contrastive Predictive Coding. preprint arXiv:1807.03748","author":"van den Oord A\u00e4ron","year":"2018"},{"key":"e_1_3_2_1_35_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_37_1","unstructured":"Zhirong Wu Yuanjun Xiong Stella X. Yu and Dahua Lin. 2018. Unsupervised Feature Learning via Non-Parametric Instance Discrimination. In CVPR. 3733--3742. Zhirong Wu Yuanjun Xiong Stella X. Yu and Dahua Lin. 2018. Unsupervised Feature Learning via Non-Parametric Instance Discrimination. In CVPR. 3733--3742."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Bo Xu Cheng Lu Yandong Guo and Jacob Wang. 2020. Discriminative Multi- Modality Speech Recognition. In CVPR. 14421--14430. Bo Xu Cheng Lu Yandong Guo and Jacob Wang. 2020. Discriminative Multi- Modality Speech Recognition. In CVPR. 14421--14430.","DOI":"10.1109\/CVPR42600.2020.01444"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Kai Xu Dawei Li Nick Cassimatis and Xiaolong Wang. 2018. LCANet: End-to- End Lipreading with Cascaded Attention-CTC. In FG. 548--555. Kai Xu Dawei Li Nick Cassimatis and Xiaolong Wang. 2018. LCANet: End-to- End Lipreading with Cascaded Attention-CTC. In FG. 548--555.","DOI":"10.1109\/FG.2018.00088"},{"key":"e_1_3_2_1_40_1","volume-title":"Koji Okabe, and Takafumi Koshinaka.","author":"Yamamoto Hitoshi","year":"2019"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3338533.3366579"},{"key":"e_1_3_2_1_42_1","volume-title":"Hearing Lips: Improving Lip Reading by Distilling Speech Recognizers. In AAAI. 6917--6924.","author":"Zhao Ya","year":"2020"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475420","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475420","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:32Z","timestamp":1750193312000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475420"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":42,"alternative-id":["10.1145\/3474085.3475420","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475420","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}