{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,13]],"date-time":"2025-05-13T18:06:29Z","timestamp":1747159589717},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Artif Intell Rev"],"published-print":{"date-parts":[[2024,1]]},"DOI":"10.1007\/s10462-023-10625-x","type":"journal-article","created":{"date-parts":[[2024,1,6]],"date-time":"2024-01-06T07:01:33Z","timestamp":1704524493000},"update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Acoustic-based LEGO recognition using attention-based convolutional neural networks"],"prefix":"10.1007","volume":"57","author":[{"given":"Van-Thuan","family":"Tran","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chia-Yang","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei-Ho","family":"Tsai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,1,6]]},"reference":[{"key":"10625_CR1","doi-asserted-by":"crossref","unstructured":"Aytar Y, Vondrick C, Torralba A (2016) SoundNet: learning sound representations from unlabeled video. In: Proceedings of the 30th International Conference on Neural Information Processing Systems. pp 892\u2013900","DOI":"10.1109\/CVPR.2016.18"},{"key":"10625_CR2","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2004.10934","author":"A Bochkovskiy","year":"2020","unstructured":"Bochkovskiy A, Wang C-Y, Liao H-YM (2020) YOLOv4 optimal speed and accuracy of object detection. arXiv:200410934\u00a0 https:\/\/doi.org\/10.48550\/arXiv.2004.10934","journal-title":"arXiv.:2004.10934\u00a0"},{"key":"10625_CR3","doi-asserted-by":"publisher","first-page":"2048","DOI":"10.1016\/j.procs.2017.08.250","volume":"112","author":"V Boddapati","year":"2017","unstructured":"Boddapati V, Petef A, Rasmusson J, Lars L (2017) Classifying environmental sounds using image recognition networks. Procedia Comput Sci 112:2048\u20132056","journal-title":"Procedia Comput Sci"},{"key":"10625_CR4","doi-asserted-by":"publisher","first-page":"140","DOI":"10.1016\/J.COMPAG.2012.04.014","volume":"85","author":"J Buerano","year":"2012","unstructured":"Buerano J, Zalameda J, Ruiz RS (2012) Microphone system optimization for free fall impact acoustic method in detection of rice kernel damage. Comput Electron Agric 85:140\u2013148. https:\/\/doi.org\/10.1016\/J.COMPAG.2012.04.014","journal-title":"Comput Electron Agric"},{"key":"10625_CR5","doi-asserted-by":"publisher","first-page":"12074","DOI":"10.1109\/TITS.2021.3109632","volume":"23","author":"Y Cao","year":"2022","unstructured":"Cao Y, Sun Y, Xie G, Li P (2022) A Sound-Based Fault Diagnosis Method for Railway Point Machines Based on Two-Stage Feature Selection Strategy and Ensemble Classifier. IEEE Trans Intell Transp Syst 23:12074\u201312083. https:\/\/doi.org\/10.1109\/TITS.2021.3109632","journal-title":"IEEE Trans Intell Transp Syst"},{"key":"10625_CR6","unstructured":"Clarke S, Rhodes T, Atkeson CG, Kroemer O (2018) Learning audio feedback for estimating amount and flow of granular material. In: Billard A, Dragan A, Peters J, Morimoto J (eds) Proceedings of the 2nd conference on robot learning. PMLR, pp 529\u2013550"},{"key":"10625_CR7","unstructured":"Colangelo F, Battisti F, Neri A, Carli M (2018) Convolutional recurrent neural network for audio events classification. In: Detect. Classif. Acoust. Scenes Events Chall. 2018. https:\/\/dcase.community\/documents\/challenge2018\/technical_reports\/DCASE2018_Colangelo_61.pdf. Accessed 12 Sep 2022"},{"key":"10625_CR8","doi-asserted-by":"crossref","unstructured":"Dai W, Dai C, Qu S, et al (2017) Very deep convolutional neural networks for raw waveforms. In: Proc. of IEEE International Conference on Acoustics, Speech and Signal Processing. Institute of Electrical and Electronics Engineers Inc., pp 421\u2013425","DOI":"10.1109\/ICASSP.2017.7952190"},{"key":"10625_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/J.ASOC.2021.107465","volume":"108","author":"R Espinosa","year":"2021","unstructured":"Espinosa R, Ponce H, Guti\u00e9rrez S (2021) Click-event sound detection in automotive industry using machine\/deep learning. Appl Soft Comput 108:1\u201312. https:\/\/doi.org\/10.1016\/J.ASOC.2021.107465","journal-title":"Appl Soft Comput"},{"key":"10625_CR10","doi-asserted-by":"crossref","unstructured":"Gandhi D, Gupta A, Pinto L (2020) Swoosh! rattle! thump!--actions that sound. In: Robotics: Science and Systems 2020. pp 1\u201310","DOI":"10.15607\/RSS.2020.XVI.002"},{"key":"10625_CR11","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1109\/TAMD.2011.2157504","volume":"4","author":"S Griffith","year":"2012","unstructured":"Griffith S, Sinapov J, Sukhoy V, Stoytchev A (2012a) A behavior-grounded approach to forming object categories: Separating containers from noncontainers. IEEE Trans Auton Ment Dev 4:54\u201369. https:\/\/doi.org\/10.1109\/TAMD.2011.2157504","journal-title":"IEEE Trans Auton Ment Dev"},{"key":"10625_CR12","unstructured":"Griffith S, Sukhoy V, Wegter T, Stoytchev A (2012b) Object categorization in the sink\u202f: learning behavior\u2014grounded object categories with water. In: Proceedings of the 2012 ICRA Workshop on Semantic Perception, Mapping and Exploration. pp 1\u20136"},{"key":"10625_CR13","doi-asserted-by":"crossref","unstructured":"Guo J, Xu N, Li L-J, Alwan A (2017) Attention based CLDNNs for short-duration acoustic scene classification. In: INTERSPEECH. pp 469\u2013473","DOI":"10.21437\/Interspeech.2017-440"},{"key":"10625_CR14","doi-asserted-by":"crossref","unstructured":"Hassan SU, Zeeshan Khan M, Ghani Khan MU, Saleem S (2019) Robust sound classification for surveillance using time frequency audio features. In: Proceeding of International Conference on Communication Technologies (ComTech). pp 13\u201318","DOI":"10.1109\/COMTECH.2019.8737801"},{"key":"10625_CR15","doi-asserted-by":"crossref","unstructured":"Henze D, Gorishti K, Bruegge B, Simen J-P (2019) AudioForesight: A process model for audio predictive maintenance in industrial environments. In: 2019 18th IEEE International Conference On Machine Learning And Applications (ICMLA). pp 352\u2013357","DOI":"10.1109\/ICMLA.2019.00066"},{"key":"10625_CR16","unstructured":"Ioffe S, Szegedy C (2015) Batch normalization: accelerating deep network training by reducing internal covariate shift. In: Proceedings of the 32nd International Conference on International Conference on Machine Learning. pp 448\u2013456"},{"key":"10625_CR17","doi-asserted-by":"crossref","unstructured":"Kim G, Han DK, Ko H (2021) SpecMix\u202f: a mixed sample data augmentation method for training withtime-frequency domain features. In: Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH. International Speech Communication Association, pp 6\u201310","DOI":"10.21437\/Interspeech.2021-103"},{"key":"10625_CR18","unstructured":"Kingma DP, Ba JL (2015) Adam: a method for stochastic optimization. In: Proceeding of International Conference for Learning Representations. pp 1\u201315"},{"key":"10625_CR19","doi-asserted-by":"crossref","unstructured":"Ko T, Peddinti V, Povey D, Khudanpur S (2015) Audio augmentation for speech recognition. In: Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH. International Speech and Communication Association, pp 3586\u20133589","DOI":"10.21437\/Interspeech.2015-711"},{"key":"10625_CR20","first-page":"1097","volume-title":"Advances in neural information processing systems","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2012) Imagenet classification with deep convolutional neural networks. In: Pereira F, Burges CJC, Bottou L, Weinberger KQ (eds) Advances in neural information processing systems. Curran Associates, New York, pp 1097\u20131105"},{"key":"10625_CR21","doi-asserted-by":"crossref","unstructured":"Lezhenin I, Bogach N, Pyshkin E (2019) Urban sound classification using long short-term memory neural network. In: Proceedings of the 2019 Federated Conference on Computer Science and Information Systems, FedCSIS 2019. Institute of Electrical and Electronics Engineers, pp 57\u201360","DOI":"10.15439\/2019F185"},{"key":"10625_CR22","doi-asserted-by":"crossref","unstructured":"Li X, Chebiyyam V, Kirchhoff K (2019) Multi-stream network with temporal attention for environmental sound classification. In: INTERSPEECH. pp 3604\u20133608","DOI":"10.21437\/Interspeech.2019-3019"},{"key":"10625_CR23","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/978-3-319-46448-0_2","volume-title":"Computer vision\u2014ECCV 2016","author":"W Liu","year":"2016","unstructured":"Liu W, Anguelov D, Erhan D et al (2016) SSD: single shot multibox detector. In: Leibe B, Matas J, Sebe N, Welling M (eds) Computer vision\u2014ECCV 2016. Springer, Berlin, pp 21\u201337"},{"key":"10625_CR24","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1177\/1729881417714996","volume":"14","author":"E Lopez-Caudana","year":"2017","unstructured":"Lopez-Caudana E, Quiroz O, Rodr\u00edguez A et al (2017) Classification of materials by acoustic signal processing in real time for NAO robots. Int J Adv Robot Syst 14:1\u201310","journal-title":"Int J Adv Robot Syst"},{"key":"10625_CR25","doi-asserted-by":"crossref","unstructured":"Mcfee B, Raffel C, Liang D, et al (2015) librosa: audio and music signal analysis in python. In: Proceeding of the 14th Python in Science Conference. pp 18\u201325","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"10625_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.apacoust.2020.107389","volume":"167","author":"Z Mushtaq","year":"2020","unstructured":"Mushtaq Z, Su S-F (2020) Environmental sound classification using a regularized deep convolutional neural network with data augmentation. Appl Acoust 167:1\u201313","journal-title":"Appl Acoust"},{"key":"10625_CR27","unstructured":"Nakamura T, Nagai T, Iwahashi N (2007) Multimodal object categorization by a robot. In: IEEE International Conference on Intelligent Robots and Systems. pp 2415\u20132420"},{"key":"10625_CR28","doi-asserted-by":"crossref","unstructured":"Park DS, Chan W, Zhang Y, et al (2019) SpecAugment: a simple data augmentation method for automatic speech recognition. In: INTERSPEECH 2019. pp 2613\u20132617","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"10625_CR29","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2017","unstructured":"Ren S, He K, Girshick R, Sun J (2017) Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. IEEE Trans Pattern Anal Mach Intell 39:1137\u20131149","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10625_CR30","doi-asserted-by":"publisher","first-page":"279","DOI":"10.1109\/LSP.2017.2657381","volume":"24","author":"J Salamon","year":"2017","unstructured":"Salamon J, Bello JP (2017) Deep Convolutional Neural Networks and Data Augmentation for Environmental Sound Classification. IEEE Signal Process Lett 24:279\u2013283","journal-title":"IEEE Signal Process Lett"},{"key":"10625_CR31","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1007\/978-3-642-34898-3_14","volume-title":"Proceeding of ambient intelligence","author":"MA Sehili","year":"2012","unstructured":"Sehili MA, Lecouteux B, Vacher M et al (2012) Sound Environment analysis in smart home. In: Patern\u00f2 F, de Ruyter B, Markopoulos P, Santoro C, van Loenen E, Luyten K (eds) Proceeding of ambient intelligence. Springer, Berlin, pp 208\u2013223"},{"key":"10625_CR32","unstructured":"Simonyan K, Zisserman A (2015) Very Deep Convolutional Networks for Large-Scale Image Recognition.\u00a0arXiv preprint arXiv:1409.1556. 1\u201314"},{"key":"10625_CR33","doi-asserted-by":"crossref","unstructured":"Sinapov J, Wiemer M, Stoytchev A (2009) Interactive learning of the acoustic properties of household objects. In: Proceedings\u2014IEEE International Conference on Robotics and Automation. pp 2518\u20132524","DOI":"10.1109\/ROBOT.2009.5152802"},{"key":"10625_CR34","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava N, Hinton GE, Krizhevsky A et al (2014) Dropout: a simple way to prevent neural networks from overfitting. J Mach Learn Res 15:1929\u20131958","journal-title":"J Mach Learn Res"},{"key":"10625_CR35","doi-asserted-by":"publisher","first-page":"3978","DOI":"10.1109\/TMM.2020.3035275","volume":"23","author":"OK Toffa","year":"2021","unstructured":"Toffa OK, Mignotte M (2021) Environmental sound classification using local binary pattern and audio features collaboration. IEEE Trans Multimed 23:3978\u20133985. https:\/\/doi.org\/10.1109\/TMM.2020.3035275","journal-title":"IEEE Trans Multimed"},{"key":"10625_CR36","doi-asserted-by":"crossref","unstructured":"Tokozume Y, Harada T (2017) Learning environmental sounds with end-to-end convolutional neural network. In: Proc. IEEE International Conference on Acoustics, Speech and Signal Processing. Institute of Electrical and Electronics Engineers. pp 2721\u20132725","DOI":"10.1109\/ICASSP.2017.7952651"},{"key":"10625_CR37","doi-asserted-by":"publisher","first-page":"1556","DOI":"10.1109\/TASL.2010.2093519","volume":"19","author":"HD Tran","year":"2011","unstructured":"Tran HD, Li H (2011) Sound event recognition with probabilistic distance SVMs. IEEE Trans Audio, Speech Lang Process 19:1556\u20131568. https:\/\/doi.org\/10.1109\/TASL.2010.2093519","journal-title":"IEEE Trans Audio, Speech Lang Process"},{"key":"10625_CR38","doi-asserted-by":"publisher","first-page":"75702","DOI":"10.1109\/ACCESS.2020.2988986","volume":"8","author":"VT Tran","year":"2020","unstructured":"Tran VT, Tsai WH (2020) Acoustic-Based Emergency Vehicle Detection Using Convolutional Neural Networks. IEEE Access 8:75702\u201375713","journal-title":"IEEE Access"},{"key":"10625_CR39","doi-asserted-by":"publisher","first-page":"27905","DOI":"10.1109\/JSEN.2021.3127893","volume":"21","author":"VT Tran","year":"2021","unstructured":"Tran VT, Tsai WH (2021) Audio-Vision Emergency Vehicle Detection. IEEE Sens J 21:27905\u201327917","journal-title":"IEEE Sens J"},{"key":"10625_CR40","doi-asserted-by":"publisher","first-page":"4453","DOI":"10.3390\/S22124453","volume":"22","author":"V-T Tran","year":"2022","unstructured":"Tran V-T, Tsai W-H, Furletov Y, Gorodnichev M (2022) End-to-End Train Horn Detection for Railway Transit Safety. Sensors 22:4453. https:\/\/doi.org\/10.3390\/S22124453","journal-title":"Sensors"},{"key":"10625_CR41","doi-asserted-by":"publisher","first-page":"1827","DOI":"10.1111\/COIN.12468","volume":"37","author":"E Tsalera","year":"2021","unstructured":"Tsalera E, Papadakis A, Samarakou M (2021) Novel principal component analysis-based feature selection mechanism for classroom sound classification. Comput Intell 37:1827\u20131843. https:\/\/doi.org\/10.1111\/COIN.12468","journal-title":"Comput Intell"},{"key":"10625_CR42","doi-asserted-by":"crossref","unstructured":"Xu K, Feng D, Mi H, et al (2018) Mixup-based acoustic scene classification using multi-channel convolutional neural network. In: Advances in Multimedia Information Processing\u2014PCM 2018. pp 14\u201323","DOI":"10.1007\/978-3-030-00764-5_2"},{"key":"10625_CR43","doi-asserted-by":"crossref","unstructured":"Yun S, Han D, Chun S, et al (2019) CutMix: regularization strategy to train strong classifiers with localizable features. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE Computer Society, pp 6022\u20136031","DOI":"10.1109\/ICCV.2019.00612"},{"key":"10625_CR44","unstructured":"Zhang H, Cisse M, Dauphin YN, Lopez-Paz D (2018a) mixup: beyond empirical risk minimization. In: 6th International Conference on Learning Representations, ICLR 2018. pp 1\u201313"},{"key":"10625_CR45","doi-asserted-by":"publisher","first-page":"2973","DOI":"10.1109\/TII.2017.2775218","volume":"14","author":"Z Zhang","year":"2018","unstructured":"Zhang Z, Wen G, Chen S (2018b) Audible Sound-Based Intelligent Evaluation for Aluminum Alloy in Robotic Pulsed GTAW: Mechanism, Feature Selection, and Defect Detection. IEEE Trans Ind Informatics 14:2973\u20132983. https:\/\/doi.org\/10.1109\/TII.2017.2775218","journal-title":"IEEE Trans Ind Informatics"},{"key":"10625_CR46","doi-asserted-by":"publisher","first-page":"896","DOI":"10.1016\/j.neucom.2020.08.069","volume":"453","author":"Z Zhang","year":"2021","unstructured":"Zhang Z, Xu S, Zhang S et al (2021) Attention based convolutional recurrent neural network for environmental sound classification. Neurocomputing 453:896\u2013903","journal-title":"Neurocomputing"}],"container-title":["Artificial Intelligence Review"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10462-023-10625-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10462-023-10625-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10462-023-10625-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,15]],"date-time":"2024-02-15T16:19:10Z","timestamp":1708013950000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10462-023-10625-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1]]},"references-count":46,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2024,1]]}},"alternative-id":["10625"],"URL":"https:\/\/doi.org\/10.1007\/s10462-023-10625-x","relation":{},"ISSN":["0269-2821","1573-7462"],"issn-type":[{"value":"0269-2821","type":"print"},{"value":"1573-7462","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,1]]},"assertion":[{"value":"17 December 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 January 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interests"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}],"article-number":"10"}}