{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T16:19:55Z","timestamp":1776183595631,"version":"3.50.1"},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,9,26]],"date-time":"2022-09-26T00:00:00Z","timestamp":1664150400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,9,26]],"date-time":"2022-09-26T00:00:00Z","timestamp":1664150400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,9,26]]},"DOI":"10.1109\/mmsp55362.2022.9949447","type":"proceedings-article","created":{"date-parts":[[2022,11,22]],"date-time":"2022-11-22T21:39:16Z","timestamp":1669153156000},"page":"1-6","source":"Crossref","is-referenced-by-count":4,"title":["Audio-visual scene classification via contrastive event-object alignment and semantic-based fusion"],"prefix":"10.1109","author":[{"given":"Yuanbo","family":"Hou","sequence":"first","affiliation":[{"name":"Ghent University,WAVES Research Group,Gent,Belgium"}]},{"given":"Bo","family":"Kang","sequence":"additional","affiliation":[{"name":"Ghent University,IDLAB,Gent,Belgium"}]},{"given":"Dick","family":"Botteldooren","sequence":"additional","affiliation":[{"name":"Ghent University,WAVES Research Group,Gent,Belgium"}]}],"member":"263","reference":[{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21105\/joss.00861"},{"key":"ref33","article-title":"Bit submission for DCASE 2020 challenge task1","author":"wang","year":"2021","journal-title":"Tech Rep DCASE2016 Challenge"},{"key":"ref32","article-title":"A multimodal wavetrans-former architecture conditioned on openl3 embeddings for audio-visual scene classification","author":"triantafyllopoulos","year":"2021","journal-title":"Tech Rep DCASE2016 Challenge"},{"key":"ref31","first-page":"217","article-title":"DCASE 2018 challenge surrey cross-task convolutional neural network baseline","author":"kong","year":"2018","journal-title":"Proc of DCASE 2018 Workshop"},{"key":"ref30","first-page":"1929","article-title":"Dropout: a simple way to prevent neural networks from overfitting","volume":"15","author":"srivastava","year":"2014","journal-title":"The Journal of Machine Learning Research"},{"key":"ref37","article-title":"Audio-visual scene classification using transfer learning and hybrid fusion strategy","author":"wang","year":"2021","journal-title":"Tech Rep DCASE2016 Challenge"},{"key":"ref36","article-title":"Scene classification using acoustic and visual feature","author":"yang","year":"2021","journal-title":"Tech Rep DCASE2016 Challenge"},{"key":"ref35","article-title":"A model ensemble approach for audio-visual scene classification","author":"wang","year":"2021","journal-title":"Tech Rep DCASE2016 Challenge"},{"key":"ref34","article-title":"CNN-based dual-stream network for audio-visual scene classification","author":"hou","year":"2021","journal-title":"Tech Rep DCASE2016 Challenge"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.09.025"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_28"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247815"},{"key":"ref13","first-page":"788","article-title":"Audio-visual scene classification: Analysis of DCASE 2021 Challenge submissions","volume":"38","author":"wang","year":"2021","journal-title":"Proc DCAS Workshop"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-658-36295-9_6"},{"key":"ref15","first-page":"16","article-title":"Squeeze-excitation convolutional recurrent neural networks for audio-visual scene classification","author":"naranjo-alcazar","year":"2021","journal-title":"Proc DCAS Workshop"},{"key":"ref16","first-page":"95","article-title":"A multi-modal fusion approach for audio-visual scene classification enhanced by clip variants","author":"okazaki","year":"2021","journal-title":"Proc DCAS Workshop"},{"key":"ref17","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"2021","journal-title":"ICML"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413528"},{"key":"ref19","doi-asserted-by":"crossref","first-page":"571","DOI":"10.21437\/Interspeech.2021-698","article-title":"AST: Audio Spectrogram Trans-former","author":"gong","year":"2021","journal-title":"Proc of Interspeech"},{"key":"ref28","first-page":"248","article-title":"Imagenet: A large-scale hierarchical image database","author":"deng","year":"2009","journal-title":"CVPR"},{"key":"ref4","article-title":"A modular audio-visual scene analysis and attention system for humanoid robots","author":"k\u00fchn","year":"2012","journal-title":"Proc 43rd Int Symp Robotics (ISR)"},{"key":"ref27","first-page":"776","article-title":"Audio set: An ontology and human-labeled dataset for audio events","author":"gemmeke","year":"2017","journal-title":"ICASSP"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2015.7353973"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/EUSIPCO.2016.7760424"},{"key":"ref29","article-title":"Decoupled weight decay regularization","author":"ilya","year":"2019","journal-title":"ICLRE"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2014.2326181"},{"key":"ref8","first-page":"117","article-title":"Learning multiscale deep features for high-resolution satellite image scene classification","volume":"56","author":"liu","year":"2017","journal-title":"IEEE Transactions on GRS"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2017.2675998"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2072298.2072411"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN55064.2022.9892893"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2006.886263"},{"key":"ref20","first-page":"735","article-title":"Voice command recognition system based on MFCC and DTW","volume":"2","author":"bala","year":"2010","journal-title":"IJEST"},{"key":"ref22","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","author":"touvron","year":"2021","journal-title":"ICML"},{"key":"ref21","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"NIPS"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413711"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9415085"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9413056"}],"event":{"name":"2022 IEEE 24th International Workshop on Multimedia Signal Processing (MMSP)","location":"Shanghai, China","start":{"date-parts":[[2022,9,26]]},"end":{"date-parts":[[2022,9,28]]}},"container-title":["2022 IEEE 24th International Workshop on Multimedia Signal Processing (MMSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9948698\/9948704\/09949447.pdf?arnumber=9949447","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,9]],"date-time":"2024-10-09T10:10:44Z","timestamp":1728468644000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9949447\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,9,26]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/mmsp55362.2022.9949447","relation":{},"subject":[],"published":{"date-parts":[[2022,9,26]]}}}