{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T22:41:52Z","timestamp":1765233712088,"version":"3.28.0"},"reference-count":41,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,12,16]]},"DOI":"10.1109\/asru57964.2023.10389692","type":"proceedings-article","created":{"date-parts":[[2024,1,19]],"date-time":"2024-01-19T18:38:40Z","timestamp":1705689520000},"page":"1-8","source":"Crossref","is-referenced-by-count":5,"title":["Flap: Fast Language-Audio Pre-Training"],"prefix":"10.1109","author":[{"given":"Ching-Feng","family":"Yeh","sequence":"first","affiliation":[{"name":"FAIR, Meta"}]},{"given":"Po-Yao","family":"Huang","sequence":"additional","affiliation":[{"name":"FAIR, Meta"}]},{"given":"Vasu","family":"Sharma","sequence":"additional","affiliation":[{"name":"FAIR, Meta"}]},{"given":"Shang-Wen","family":"Li","sequence":"additional","affiliation":[{"name":"FAIR, Meta"}]},{"given":"Gargi","family":"Gosh","sequence":"additional","affiliation":[{"name":"FAIR, Meta"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Unsupervised feature learning and deep learning: A review and new perspectives","volume":"abs\/1206.5538","author":"Bengio","year":"2012","journal-title":"CoRR"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2021.3134634"},{"key":"ref3","article-title":"Discriminative unsupervised feature learning with convolutional neural networks","volume":"27","author":"Dosovitskiy","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. NAACL-HLT","author":"Devlin","key":"ref6"},{"key":"ref7","article-title":"RoBERTa: A robustly optimized BERT Pre-training approach","volume":"abs\/1907.11692","author":"Liu","year":"2019","journal-title":"CoRR"},{"key":"ref8","article-title":"Masked autoencoders that listen","author":"Huang","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.3390\/technologies9010002"},{"key":"ref11","article-title":"Learning transferable visual models from natural language supervision","volume":"abs\/2103.00020","author":"Radford","year":"2021","journal-title":"CoRR"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.195"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01039"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49357.2023.10095889"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49357.2023.10095969"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2006.100"},{"key":"ref18","article-title":"A simple framework for contrastive learning of visual representations","volume":"abs\/2002.05709","author":"Chen","year":"2020","journal-title":"CoRR"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr42600.2020.00975"},{"article-title":"Attention is all you need","volume-title":"Proc. NeurIPS","author":"Vaswani","key":"ref20"},{"key":"ref21","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref22","article-title":"MAViL: masked audio-video learners","author":"Huang","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Wav2CLIP: learning robust audio representations from CLIP","volume-title":"Proc. ICASSP","author":"Ho-HsiangWu","key":"ref23"},{"key":"ref24","first-page":"10078","article-title":"VideoMAE: masked autoencoders are data-efficient learners for self-supervised video pre-training","volume":"35","author":"Tong","year":"2022","journal-title":"Advances in neural information processing systems"},{"key":"ref25","article-title":"Masked autoencoders as spatiotemporal learners","author":"Feichtenhofer","year":"2022","journal-title":"NeurIPS"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.02240"},{"key":"ref27","article-title":"LLaMA: open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv preprint arXiv:2302.13971"},{"article-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","year":"2023","author":"Chiang","key":"ref28"},{"article-title":"Stanford alpaca: An instruction-following llama model","year":"2023","author":"Taori","key":"ref29"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-1136"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3149712"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11115"},{"key":"ref33","article-title":"Representation learning with contrastive predictive coding","author":"den Oord","year":"2018","journal-title":"arXiv preprint arXiv:1807.03748"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-2680"},{"key":"ref36","article-title":"Judging LLMas-a-judge with MT-bench and chatbot arena","volume":"abs\/2306.05685","author":"Zheng","year":"2023","journal-title":"CoRR"},{"key":"ref37","article-title":"LLaMA: open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv preprint arXiv:2302.13971"},{"article-title":"AudioCaps: generating captions for audios in the wild","volume-title":"Proc. NAACL-HLT","author":"Kim","key":"ref38"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"ref40","article-title":"fvcore: Collection of common code that\u2019s shared among different research projects in fair computer vision team"},{"article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. ICLR","author":"Kingma","key":"ref41"}],"event":{"name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","start":{"date-parts":[[2023,12,16]]},"location":"Taipei, Taiwan","end":{"date-parts":[[2023,12,20]]}},"container-title":["2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10388490\/10389614\/10389692.pdf?arnumber=10389692","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,23]],"date-time":"2024-01-23T16:36:09Z","timestamp":1706027769000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10389692\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,16]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/asru57964.2023.10389692","relation":{},"subject":[],"published":{"date-parts":[[2023,12,16]]}}}