{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,3]],"date-time":"2026-07-03T16:08:49Z","timestamp":1783094929603,"version":"3.54.6"},"publisher-location":"Singapore","reference-count":32,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819981830","type":"print"},{"value":"9789819981847","type":"electronic"}],"license":[{"start":{"date-parts":[[2023,11,26]],"date-time":"2023-11-26T00:00:00Z","timestamp":1700956800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,11,26]],"date-time":"2023-11-26T00:00:00Z","timestamp":1700956800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-99-8184-7_4","type":"book-chapter","created":{"date-parts":[[2023,11,25]],"date-time":"2023-11-25T07:02:41Z","timestamp":1700895761000},"page":"41-53","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Exploring Efficient-Tuned Learning Audio Representation Method from\u00a0BriVL"],"prefix":"10.1007","author":[{"given":"Sen","family":"Fang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yangjian","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bowen","family":"Gao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jingwen","family":"Cai","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Teik Toe","family":"Teoh","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2023,11,26]]},"reference":[{"key":"4_CR1","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv. Neural Inf. Process. Syst. 33, 12449\u201312460 (2020)"},{"key":"4_CR2","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Vedaldi, A., Zisserman, A.: Vggsound: a large-scale audio-visual dataset. In: ICASSP, pp. 721\u2013725. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"issue":"6","key":"4_CR3","doi-asserted-by":"publisher","first-page":"1505","DOI":"10.1109\/JSTSP.2022.3188113","volume":"16","author":"S Chen","year":"2022","unstructured":"Chen, S., et al.: WavLM: large-scale self-supervised pre-training for full stack speech processing. IEEE J. Sel. Top. Sig. Process. 16(6), 1505\u20131518 (2022)","journal-title":"IEEE J. Sel. Top. Sig. Process."},{"key":"4_CR4","doi-asserted-by":"crossref","unstructured":"Cramer, J., Wu, H.H., Salamon, J., Bello, J.P.: Look, listen, and learn more: Design choices for deep audio embeddings. In: ICASSP, pp. 3852\u20133856. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682475"},{"key":"4_CR5","doi-asserted-by":"crossref","unstructured":"Cudeiro, D., Bolkart, T., Laidlaw, C., Ranjan, A., Black, M.J.: Capture, learning, and synthesis of 3d speaking styles. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10101\u201310111 (2019)","DOI":"10.1109\/CVPR.2019.01034"},{"key":"4_CR6","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"4_CR7","doi-asserted-by":"crossref","unstructured":"Drossos, K., Lipping, S., Virtanen, T.: Clotho: an audio captioning dataset. In: ICASSP, May 2020. https:\/\/arxiv.org\/abs\/1910.09387","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"4_CR8","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"4_CR9","unstructured":"Fanzeres, L.A., Nadeu, C.: Sound-to-imagination: unsupervised crossmodal translation using deep dense network architecture. arXiv preprint arXiv:2106.01266 (2021)"},{"issue":"1","key":"4_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/s41467-022-30761-2","volume":"13","author":"N Fei","year":"2022","unstructured":"Fei, N., et al.: Towards artificial general intelligence via a multimodal foundation model. Nat. Commun. 13(1), 1\u201313 (2022)","journal-title":"Nat. Commun."},{"key":"4_CR11","doi-asserted-by":"crossref","unstructured":"Guzhov, A., Raue, F., Hees, J., Dengel, A.: Audioclip: extending clip to image, text and audio. arXiv preprint arXiv:2106.13043 (2021)","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"4_CR12","doi-asserted-by":"publisher","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9726\u20139735 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.00975","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"4_CR13","doi-asserted-by":"publisher","unstructured":"Ilharco, G., Zhang, Y., Baldridge, J.: Large-scale representation learning from visually grounded untranscribed speech. In: Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL), pp. 55\u201365. Association for Computational Linguistics, Hong Kong, China, November 2019. https:\/\/doi.org\/10.18653\/v1\/K19-1006, https:\/\/aclanthology.org\/K19-1006","DOI":"10.18653\/v1\/K19-1006"},{"key":"4_CR14","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"4_CR15","unstructured":"Kaplan, J., et al.: Scaling laws for neural language models. arXiv preprint arXiv:2001.08361 (2020)"},{"issue":"4","key":"4_CR16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073658","volume":"36","author":"T Karras","year":"2017","unstructured":"Karras, T., Aila, T., Laine, S., Herva, A., Lehtinen, J.: Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Trans. Graph. (TOG) 36(4), 1\u201312 (2017)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"4_CR17","doi-asserted-by":"publisher","unstructured":"Kazakos, E., Nagrani, A., Zisserman, A., Damen, D.: Slow-fast auditory streams for audio recognition. In: ICASSP, pp. 855\u2013859 (2021). https:\/\/doi.org\/10.1109\/ICASSP39728.2021.9413376","DOI":"10.1109\/ICASSP39728.2021.9413376"},{"key":"4_CR18","unstructured":"Pedersoli, F., Wiebe, D., Banitalebi, A., Zhang, Y., Yi, K.M.: Estimating visual information from audio through manifold learning. arXiv preprint arXiv:2208.02337 (2022)"},{"key":"4_CR19","doi-asserted-by":"publisher","unstructured":"Piczak, K.J.: ESC: dataset for environmental sound classification. In: ACM Multimedia, p. 1015. ACM Press (2015). https:\/\/doi.org\/10.1145\/2733373.2806390, http:\/\/dl.acm.org\/citation.cfm?doid=2733373.2806390","DOI":"10.1145\/2733373.2806390"},{"key":"4_CR20","unstructured":"Qiu, Y., Kataoka, H.: Image generation associated with music data. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 2510\u20132513 (2018)"},{"key":"4_CR21","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"4_CR22","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10684\u201310695, June 2022","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Salamon, J., Jacoby, C., Bello, J.P.: A dataset and taxonomy for urban sound research. In: ACM Multimedia, pp. 1041\u20131044. Orlando, FL, USA, Nov 2014","DOI":"10.1145\/2647868.2655045"},{"key":"4_CR24","doi-asserted-by":"crossref","unstructured":"Sung-Bin, K., Senocak, A., Ha, H., Owens, A., Oh, T.H.: Sound to visual scene generation by audio-to-visual latent alignment (2023)","DOI":"10.1109\/CVPR52729.2023.00622"},{"key":"4_CR25","unstructured":"Tan, M., Le, Q.: EfficientNet: rethinking model scaling for convolutional neural networks. In: International Conference on Machine Learning, pp. 6105\u20136114. PMLR (2019)"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Turpault, N., Serizel, R., Parag Shah, A., Salamon, J.: Sound event detection in domestic environments with weakly labeled data and soundscape synthesis. In: DCASE. New York City, United States, October 2019. https:\/\/hal.inria.fr\/hal-02160855","DOI":"10.33682\/006b-jx26"},{"key":"4_CR27","doi-asserted-by":"publisher","unstructured":"Wan, C.H., Chuang, S.P., Lee, H.Y.: Towards audio to scene image synthesis using generative adversarial network. In: ICASSP 2019\u20132019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 496\u2013500 (2019). https:\/\/doi.org\/10.1109\/ICASSP.2019.8682383","DOI":"10.1109\/ICASSP.2019.8682383"},{"key":"4_CR28","doi-asserted-by":"crossref","unstructured":"Wu, H.H., Seetharaman, P., Kumar, K., Bello, J.P.: Wav2clip: learning robust audio representations from clip. In: ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4563\u20134567. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"4_CR29","doi-asserted-by":"crossref","unstructured":"Zhang, L., Agrawala, M.: Adding conditional control to text-to-image diffusion models (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"4_CR30","doi-asserted-by":"publisher","first-page":"93","DOI":"10.1016\/j.neucom.2022.03.015","volume":"490","author":"P Zhao","year":"2022","unstructured":"Zhao, P., Chen, Y., Zhao, L., Wu, G., Zhou, X.: Generating images from audio under semantic consistency. Neurocomputing 490, 93\u2013103 (2022)","journal-title":"Neurocomputing"},{"key":"4_CR31","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Wang, Z., Fang, C., Bui, T., Berg, T.L.: Visual to sound: generating natural sound for videos in the wild. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00374"},{"issue":"3","key":"4_CR32","doi-asserted-by":"publisher","first-page":"351","DOI":"10.1007\/s11633-021-1293-0","volume":"18","author":"H Zhu","year":"2021","unstructured":"Zhu, H., Luo, M.D., Wang, R., Zheng, A.H., He, R.: Deep audio-visual learning: A survey. Int. J. Autom. Comput. 18(3), 351\u2013376 (2021)","journal-title":"Int. J. Autom. Comput."}],"container-title":["Communications in Computer and Information Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-99-8184-7_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,25]],"date-time":"2024-03-25T06:03:08Z","timestamp":1711346588000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-99-8184-7_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,26]]},"ISBN":["9789819981830","9789819981847"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-981-99-8184-7_4","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"value":"1865-0929","type":"print"},{"value":"1865-0937","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,11,26]]},"assertion":[{"value":"26 November 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Changsha","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 November 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 November 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/iconip2023.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1274","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"650","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"51% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.14","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.46","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}