{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T19:37:33Z","timestamp":1730230653380,"version":"3.28.0"},"reference-count":24,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,5,23]]},"DOI":"10.1109\/icassp43922.2022.9747361","type":"proceedings-article","created":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T19:50:34Z","timestamp":1651089034000},"page":"4728-4732","source":"Crossref","is-referenced-by-count":1,"title":["Enhancing Contrastive Learning with Temporal Cognizance for Audio-Visual Representation Generation"],"prefix":"10.1109","author":[{"given":"Chandrashekhar","family":"Lavania","sequence":"first","affiliation":[{"name":"Amazon"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shiva","family":"Sundaram","sequence":"additional","affiliation":[{"name":"Amazon"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sundararajan","family":"Srinivasan","sequence":"additional","affiliation":[{"name":"Amazon"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Katrin","family":"Kirchhoff","sequence":"additional","affiliation":[{"name":"Amazon"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01229"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00021"},{"key":"ref12","article-title":"Self-supervised learning by cross-modal audio-video clustering","volume":"33","author":"alwassel","year":"2020","journal-title":"NeurIPS"},{"key":"ref13","article-title":"See, hear, and read: Deep aligned representations","author":"aytar","year":"2017","journal-title":"arXiv preprint arXiv 1706 00932"},{"key":"ref14","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","author":"korbar","year":"2018","journal-title":"arXiv preprint arXiv 1807 00230"},{"key":"ref15","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018","journal-title":"arXiv preprint arXiv 1810 04805"},{"key":"ref16","first-page":"297","article-title":"Noise-contrastive estimation: A new estimation principle for unnormalized statistical models","author":"gutmann","year":"2010","journal-title":"Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref18","article-title":"Creating summaries from user videos","author":"gygli","year":"2014","journal-title":"ECCV"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref4","first-page":"609","article-title":"Look, listen and learn","author":"arandjelovic","year":"2017","journal-title":"ICCV"},{"key":"ref3","first-page":"251","article-title":"Out of time: automated lip sync in the wild","author":"chung","year":"2016","journal-title":"Asian Conference on Computer Vision"},{"key":"ref6","first-page":"631","article-title":"Audio-visual scene analysis with self-supervised multisensory features","author":"owens","year":"2018","journal-title":"ECCV"},{"key":"ref5","first-page":"435","article-title":"Objects that sound","author":"arandjelovic","year":"2018","journal-title":"ECCV"},{"key":"ref8","article-title":"Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text","author":"akbari","year":"2021","journal-title":"arXiv preprint arXiv 2104 11178"},{"key":"ref7","first-page":"7","article-title":"Self-supervised multimodal versatile networks","volume":"2","author":"alayrac","year":"2020","journal-title":"NeurIPS"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1037\/h0077714"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383573"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413869"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019143"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12255"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054476"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3395035.3425202"}],"event":{"name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2022,5,23]]},"location":"Singapore, Singapore","end":{"date-parts":[[2022,5,27]]}},"container-title":["ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9745891\/9746004\/09747361.pdf?arnumber=9747361","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,22]],"date-time":"2022-08-22T20:14:04Z","timestamp":1661199244000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9747361\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,5,23]]},"references-count":24,"URL":"https:\/\/doi.org\/10.1109\/icassp43922.2022.9747361","relation":{},"subject":[],"published":{"date-parts":[[2022,5,23]]}}}