{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T05:59:13Z","timestamp":1777528753657,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,2,2]],"date-time":"2024-02-02T00:00:00Z","timestamp":1706832000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,2,2]]},"DOI":"10.1145\/3651671.3651733","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T18:55:50Z","timestamp":1717786550000},"page":"665-669","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Evaluation of Environmental Sound Classification using Vision Transformer"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-2882-6288","authenticated-orcid":false,"given":"Changlong","family":"Wang","sequence":"first","affiliation":[{"name":"School of Engineering, Tohoku University, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8835-7877","authenticated-orcid":false,"given":"Akinori","family":"Ito","sequence":"additional","affiliation":[{"name":"School of Engineering, Tohoku University, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2278-0429","authenticated-orcid":false,"given":"Takashi","family":"Nose","sequence":"additional","affiliation":[{"name":"School of Engineering, Tohoku University, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0151-2556","authenticated-orcid":false,"given":"Chia-Ping","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, National Sun Yat-Sen University, China (Taiwan)"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.iswa.2022.200115"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01393"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2017.08.250"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"e_1_3_2_1_5_1","volume-title":"Beats: Audio pre-training with acoustic tokenizers. arXiv preprint arXiv:2212.09058","author":"Chen Sanyuan","year":"2022","unstructured":"Sanyuan Chen, Yu Wu, Chengyi Wang, Shujie Liu, Daniel Tompkins, Zhuo Chen, and Furu Wei. 2022. Beats: Audio pre-training with acoustic tokenizers. arXiv preprint arXiv:2212.09058 (2022)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952190"},{"key":"e_1_3_2_1_7_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"e_1_3_2_1_9_1","volume-title":"Natural language supervision for general-purpose audio representations. arXiv preprint arXiv:2309.05767","author":"Elizalde Benjamin","year":"2023","unstructured":"Benjamin Elizalde, Soham Deshmukh, and Huaming Wang. 2023. Natural language supervision for general-purpose audio representations. arXiv preprint arXiv:2309.05767 (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"End-to-end audio strikes back: Boosting augmentations towards an efficient audio classification network. arXiv preprint arXiv:2204.11479","author":"Gazneli Avi","year":"2022","unstructured":"Avi Gazneli, Gadi Zimerman, Tal Ridnik, Gilad Sharir, and Asaf Noy. 2022. End-to-end audio strikes back: Boosting augmentations towards an efficient audio classification network. arXiv preprint arXiv:2204.11479 (2022)."},{"key":"e_1_3_2_1_11_1","volume-title":"Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778","author":"Gong Yuan","year":"2021","unstructured":"Yuan Gong, Yu-An Chung, and James Glass. 2021. Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778 (2021)."},{"key":"e_1_3_2_1_12_1","volume-title":"Aclnet: efficient end-to-end audio classification cnn. arXiv preprint arXiv:1811.06669","author":"Huang J","year":"2018","unstructured":"Jonathan\u00a0J Huang and Juan Jose\u00a0Alvarado Leanos. 2018. Aclnet: efficient end-to-end audio classification cnn. arXiv preprint arXiv:1811.06669 (2018)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462200"},{"key":"e_1_3_2_1_14_1","volume-title":"CAT: Causal Audio Transformer for Audio Classification. In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1\u20135.","author":"Liu Xiaoyu","year":"2023","unstructured":"Xiaoyu Liu, Hanlin Lu, Jianbo Yuan, and Xinyu Li. 2023. CAT: Causal Audio Transformer for Audio Classification. In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1\u20135."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414229"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0214168"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2021.3090678"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.apacoust.2016.06.010"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP.2015.7324337"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Hardik\u00a0B Sailor Dharmesh\u00a0M Agrawal and Hemant\u00a0A Patil. 2017. Unsupervised Filterbank Learning Using Convolutional Restricted Boltzmann Machine for Environmental Sound Classification.. In InterSpeech Vol.\u00a08. 9.","DOI":"10.21437\/Interspeech.2017-831"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2017.2657381"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-69900-4_40"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952651"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.5555\/353629.353648"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","unstructured":"Ross Wightman. 2019. PyTorch Image Models. https:\/\/github.com\/rwightman\/pytorch-image-models. https:\/\/doi.org\/10.5281\/zenodo.4414861","DOI":"10.5281\/zenodo.4414861"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.23919\/Eusipco47968.2020.9287705"},{"key":"e_1_3_2_1_28_1","volume-title":"Connecting the Dots between Audio and Text without Parallel Data through Visual Knowledge Transfer. ArXiv abs\/2112.08995","author":"Zhao Yanpeng","year":"2021","unstructured":"Yanpeng Zhao, Jack Hessel, Youngjae Yu, Ximing Lu, Rowan Zellers, and Yejin Choi. 2021. Connecting the Dots between Audio and Text without Parallel Data through Visual Knowledge Transfer. ArXiv abs\/2112.08995 (2021). https:\/\/api.semanticscholar.org\/CorpusID:245218746"}],"event":{"name":"ICMLC 2024: 2024 16th International Conference on Machine Learning and Computing","location":"Shenzhen China","acronym":"ICMLC 2024"},"container-title":["Proceedings of the 2024 16th International Conference on Machine Learning and Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3651671.3651733","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3651671.3651733","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T11:19:18Z","timestamp":1755861558000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3651671.3651733"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,2]]},"references-count":28,"alternative-id":["10.1145\/3651671.3651733","10.1145\/3651671"],"URL":"https:\/\/doi.org\/10.1145\/3651671.3651733","relation":{},"subject":[],"published":{"date-parts":[[2024,2,2]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}