{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,20]],"date-time":"2025-10-20T10:28:09Z","timestamp":1760956089139},"reference-count":22,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,6,6]]},"DOI":"10.1109\/icassp39728.2021.9414924","type":"proceedings-article","created":{"date-parts":[[2021,5,13]],"date-time":"2021-05-13T19:53:45Z","timestamp":1620935625000},"page":"4345-4349","source":"Crossref","is-referenced-by-count":6,"title":["Detection of Audio-Video Synchronization Errors Via Event Detection"],"prefix":"10.1109","author":[{"given":"Joshua P.","family":"Ebenezer","sequence":"first","affiliation":[{"name":"Amazon Prime Video,Seattle,WA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yongjun","family":"Wu","sequence":"additional","affiliation":[{"name":"Amazon Prime Video,Seattle,WA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hai","family":"Wei","sequence":"additional","affiliation":[{"name":"Amazon Prime Video,Seattle,WA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sriram","family":"Sethuraman","sequence":"additional","affiliation":[{"name":"Amazon Prime Video,Seattle,WA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zongyi","family":"Liu","sequence":"additional","affiliation":[{"name":"Amazon Prime Video,Seattle,WA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2019.2908700"},{"key":"ref12","article-title":"Deep audio-visual speech recognition","author":"afouras","year":"2018","journal-title":"IEEE Trans Pattern Anal Machine Intell"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.2478\/eletel-2014-0042"},{"key":"ref14","article-title":"Rethinking cnn models for audio classification","author":"palanisamy","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2017.8081508"},{"key":"ref22","article-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"soomro","year":"2012","journal-title":"ArXiv Preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1111\/2041-210X.13103"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICVRV.2017.00070"},{"journal-title":"ITU Rec J 248","year":"2008","key":"ref1"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00559"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.33682\/8axe-9243"},{"key":"ref19","first-page":"5693","article-title":"Deep highresolution representation learning for human pose estimation","author":"sun","year":"0","journal-title":"IEEE Conf Comp Vis Pattern Recognit"},{"key":"ref18","article-title":"Esresnet: Environmental sound classification based on visual domain models","author":"guzhov","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"journal-title":"Automated lip sync error correction","year":"2005","author":"baker","key":"ref7"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref4","first-page":"25","article-title":"On attention modules for audio-visual synchronization","author":"khosravan","year":"0","journal-title":"IEEE Workshop Comput Vis Pattern Recognit"},{"key":"ref3","first-page":"7763","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","author":"korbar","year":"2018","journal-title":"Advances Neural Inf Process Syst"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1242\/jeb.173724"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54427-4_19"}],"event":{"name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2021,6,6]]},"location":"Toronto, ON, Canada","end":{"date-parts":[[2021,6,11]]}},"container-title":["ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9413349\/9413350\/09414924.pdf?arnumber=9414924","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,13]],"date-time":"2023-07-13T21:44:03Z","timestamp":1689284643000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9414924\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,6]]},"references-count":22,"URL":"https:\/\/doi.org\/10.1109\/icassp39728.2021.9414924","relation":{},"subject":[],"published":{"date-parts":[[2021,6,6]]}}}