{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T17:07:56Z","timestamp":1730221676928,"version":"3.28.0"},"reference-count":31,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,5,27]],"date-time":"2024-05-27T00:00:00Z","timestamp":1716768000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,27]],"date-time":"2024-05-27T00:00:00Z","timestamp":1716768000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,5,27]]},"DOI":"10.1109\/fg59268.2024.10581985","type":"proceedings-article","created":{"date-parts":[[2024,7,11]],"date-time":"2024-07-11T17:40:08Z","timestamp":1720719608000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["Lip and Speech Synchronization using Supervised Contrastive Learning and Cross-Modal Attention"],"prefix":"10.1109","author":[{"given":"Munender","family":"Varshney","sequence":"first","affiliation":[{"name":"R&#x0026;D Centre, Hitachi India Pvt Ltd,Bangalore,India"}]},{"given":"Mayurakshi","family":"Mukherji","sequence":"additional","affiliation":[{"name":"R&#x0026;D Centre, Hitachi India Pvt Ltd,Bangalore,India"}]},{"given":"Senthil Raja","family":"G","sequence":"additional","affiliation":[{"name":"R&#x0026;D Centre, Hitachi India Pvt Ltd,Bangalore,India"}]},{"given":"Ananth","family":"Ganesh","sequence":"additional","affiliation":[{"name":"R&#x0026;D Centre, Hitachi India Pvt Ltd,Bangalore,India"}]},{"given":"Kingshuk","family":"Banerjee","sequence":"additional","affiliation":[{"name":"R&#x0026;D Centre, Hitachi India Pvt Ltd,Bangalore,India"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2018.2889052"},{"key":"ref2","article-title":"Relative timing of sound and vision for broadcasting","author":"Bt","year":"1998","journal-title":"Relative timing of sound and vision for broadcasting"},{"key":"ref3","article-title":"Audio-visual synchronisation in the wild","author":"Chen","year":"2021","journal-title":"arXiv preprint"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/icassp48485.2024.10446372"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682524"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414924"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2021.3134634"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00518"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3152247"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10861"},{"key":"ref12","article-title":"Supervised contrastive learning","author":"Khosla","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref13","first-page":"25","article-title":"On attention modules for audio-visual synchronization","volume-title":"CVPR Workshops","author":"Khosravan"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383616"},{"key":"ref15","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014","journal-title":"arXiv preprint"},{"key":"ref16","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","volume":"31","author":"Korbar","year":"2018","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ACSSC.1994.471519"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1002\/vis.4340020404"},{"key":"ref19","article-title":"Lip synchronization of speech","author":"McAllister","year":"1997","journal-title":"Audio-Visual Speech Processing: Computational & Cognitive Science Approaches"},{"key":"ref20","article-title":"A cappella: Audio-visual singing voice separation","author":"Montesinos","year":"2021","journal-title":"arXiv preprint"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2002.5745053"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00721"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20102"},{"journal-title":"Multi-modal self-supervision from generalized data transformations","year":"2020","author":"Patrick","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"ref28","article-title":"Sdnerf: Towards lifelike talking head animation via spatially-adaptive dual-driven nerfs","author":"Shen","year":"2023","journal-title":"IEEE Transactions on Multimedia"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.5555\/3298023.3298188"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref31","article-title":"Dive into deep learning","author":"Zhang","year":"2021","journal-title":"arXiv preprint"}],"event":{"name":"2024 IEEE 18th International Conference on Automatic Face and Gesture Recognition (FG)","start":{"date-parts":[[2024,5,27]]},"location":"Istanbul, Turkiye","end":{"date-parts":[[2024,5,31]]}},"container-title":["2024 IEEE 18th International Conference on Automatic Face and Gesture Recognition (FG)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10581880\/10581860\/10581985.pdf?arnumber=10581985","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,18]],"date-time":"2024-07-18T05:11:41Z","timestamp":1721279501000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10581985\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,27]]},"references-count":31,"URL":"https:\/\/doi.org\/10.1109\/fg59268.2024.10581985","relation":{},"subject":[],"published":{"date-parts":[[2024,5,27]]}}}