{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T22:00:44Z","timestamp":1780437644109,"version":"3.54.1"},"reference-count":40,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Expert Systems with Applications"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.eswa.2026.132951","type":"journal-article","created":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T06:58:01Z","timestamp":1779433081000},"page":"132951","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["TIAMO: Text-image-audio multimodal model for respiratory sound classification"],"prefix":"10.1016","volume":"329","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-0431-9161","authenticated-orcid":false,"given":"Kim-Ngoc T.","family":"Le","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3654-2577","authenticated-orcid":false,"given":"Duc-Toan","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5286-6629","authenticated-orcid":false,"given":"Duc-Tai","family":"Le","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9063-8045","authenticated-orcid":false,"given":"Min Young","family":"Chung","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2692-6883","authenticated-orcid":false,"given":"Moonseong","family":"Kim","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6485-3155","authenticated-orcid":false,"given":"Hyunseung","family":"Choo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"issue":"3","key":"10.1016\/j.eswa.2026.132951_bib0001","first-page":"535","article-title":"Deep neural network for respiratory sound classification in wearable devices enabled by patient specific model tuning","volume":"14","author":"Acharya","year":"2020","journal-title":"IEEE Transactions on Biomedical Circuits and Systems"},{"key":"10.1016\/j.eswa.2026.132951_bib0002","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/s13640-017-0213-2","article-title":"Classification of lung sounds using convolutional neural networks","volume":"2017","author":"Aykanat","year":"2017","journal-title":"EURASIP Journal on Image and Video Processing"},{"key":"10.1016\/j.eswa.2026.132951_bib0003","doi-asserted-by":"crossref","unstructured":"Bae, S., Kim, J.-W., Cho, W.-Y., Baek, H., Son, S., Lee, B., Ha, C., Tae, K., Kim, S., & Yun, S.-Y. (2023). Patch-mix contrastive learning with audio spectrogram transformer on respiratory sound classification. arXiv: 2305.14032.","DOI":"10.21437\/Interspeech.2023-1426"},{"key":"10.1016\/j.eswa.2026.132951_bib0004","unstructured":"Baevski, A., Schneider, S., & Auli, M. (2019). VQ-WAV2VEC: Self-supervised learning of discrete speech representations. arXiv: 1910.05453."},{"key":"10.1016\/j.eswa.2026.132951_bib0005","first-page":"12449","article-title":"WAV2VEC 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.eswa.2026.132951_bib0006","doi-asserted-by":"crossref","first-page":"58","DOI":"10.1016\/j.artmed.2018.04.008","article-title":"Lung sounds classification using convolutional neural networks","volume":"88","author":"Bardou","year":"2018","journal-title":"Artificial Intelligence in Medicine"},{"key":"10.1016\/j.eswa.2026.132951_bib0007","unstructured":"Based Shuvo, S., & Hasan, T. (2025). A multi-stage hybrid CNN-transformer network for automated pediatric lung sound classification. arXiv e-prints arXiv\u20132507."},{"key":"10.1016\/j.eswa.2026.132951_bib0008","series-title":"ICASSP 2022-2022 IEEE international conference on acoustics, speech and signal processing (ICASSP)","first-page":"646","article-title":"HTS-At: A hierarchical token-semantic audio transformer for sound classification and detection","author":"Chen","year":"2022"},{"key":"10.1016\/j.eswa.2026.132951_bib0009","series-title":"International conference on machine learning","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","author":"Chen","year":"2020"},{"key":"10.1016\/j.eswa.2026.132951_bib0010","doi-asserted-by":"crossref","DOI":"10.1016\/j.bspc.2022.104093","article-title":"Incorporating support vector machine to the classification of respiratory sounds by convolutional neural network","volume":"79","author":"Cinyol","year":"2023","journal-title":"Biomedical Signal Processing and Control"},{"key":"10.1016\/j.eswa.2026.132951_bib0011","unstructured":"van den, O. A., Li, Y., & Vinyals, O. (2018). Representation learning with contrastive predictive coding. arXiv: 1807.03748."},{"key":"10.1016\/j.eswa.2026.132951_bib0012","doi-asserted-by":"crossref","DOI":"10.1016\/j.bspc.2025.107790","article-title":"Respiratory sounds classification by fusing the time-domain and 2D spectral features","volume":"107","author":"Dong","year":"2025","journal-title":"Biomedical Signal Processing and Control"},{"key":"10.1016\/j.eswa.2026.132951_bib0013","series-title":"2021 43rd annual international conference of the IEEE engineering in medicine & biology society (EMBC)","first-page":"527","article-title":"RespireNet: A deep neural network for accurately detecting abnormal lung sounds in limited data setting","author":"Gairola","year":"2021"},{"key":"10.1016\/j.eswa.2026.132951_bib0014","series-title":"ICASSP 2024-2024 IEEE international conference on acoustics, speech and signal processing (ICASSP)","first-page":"8626","article-title":"Multi-view spectrogram transformer for respiratory sound classification","author":"He","year":"2024"},{"key":"10.1016\/j.eswa.2026.132951_bib0015","series-title":"International conference on biomedical and health informatics","first-page":"39","article-title":"Hidden Markov model based respiratory sound classification","author":"Jakovljevi\u0107","year":"2017"},{"key":"10.1016\/j.eswa.2026.132951_bib0016","unstructured":"Jang, E., Gu, S., & Poole, B. (2016). Categorical reparameterization with gumbel-softmax. arXiv: 1611.01144."},{"issue":"24","key":"10.1016\/j.eswa.2026.132951_bib0017","doi-asserted-by":"crossref","first-page":"17029","DOI":"10.1007\/s00521-021-06295-x","article-title":"GTCC-based bilstm deep-learning framework for respiratory sound classification using empirical mode decomposition","volume":"33","author":"Jayalakshmy","year":"2021","journal-title":"Neural Computing and Applications"},{"issue":"1","key":"10.1016\/j.eswa.2026.132951_bib0018","doi-asserted-by":"crossref","first-page":"117","DOI":"10.1109\/TPAMI.2010.57","article-title":"Product quantization for nearest neighbor search","volume":"33","author":"Jegou","year":"2010","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2026.132951_bib0019","unstructured":"Jiang, D., Lei, X., Li, W., Luo, N., Hu, Y., Zou, W., & Li, X. (2019). Improving transformer-based speech recognition using unsupervised pre-training. arXiv: 1910.09932."},{"key":"10.1016\/j.eswa.2026.132951_bib0020","series-title":"ICASSP 2024-2024 IEEE international conference on acoustics, speech and signal processing (ICASSP)","first-page":"1431","article-title":"Stethoscope-guided supervised contrastive learning for cross-domain adaptation on respiratory sound classification","author":"Kim","year":"2024"},{"key":"10.1016\/j.eswa.2026.132951_bib0021","doi-asserted-by":"crossref","unstructured":"Kim, J.-W., Toikkanen, M., Choi, Y., Moon, S.-E., & Jung, H.-Y. (2024b). BTS: Bridging text and sound modalities for metadata-aided respiratory sound classification. arXiv: 2406.06786.","DOI":"10.21437\/Interspeech.2024-492"},{"key":"10.1016\/j.eswa.2026.132951_bib0022","series-title":"Artificial neural networks and machine learning\u2013ICANN 2018: 27th international conference on artificial neural networks, Rhodes, Greece, October 4\u20137, 2018, proceedings, part III 27","first-page":"208","article-title":"Noise masking recurrent neural network for respiratory sound classification","author":"Kochetov","year":"2018"},{"key":"10.1016\/j.eswa.2026.132951_bib0023","series-title":"International conference on future data and security engineering","first-page":"556","article-title":"Feature fool exploitation for lightweight anomaly detection in respiratory sound","author":"Le","year":"2023"},{"key":"10.1016\/j.eswa.2026.132951_bib0024","doi-asserted-by":"crossref","first-page":"4834","DOI":"10.1109\/JBHI.2025.3545156","article-title":"Respiratory anomaly and disease detection using multi-level temporal convolutional networks","volume":"29","author":"Le","year":"2025","journal-title":"IEEE Journal of Biomedical and Health Informatics"},{"issue":"6","key":"10.1016\/j.eswa.2026.132951_bib0025","doi-asserted-by":"crossref","first-page":"1516","DOI":"10.1109\/TFUZZ.2022.3144448","article-title":"Explainable CNN with fuzzy tree regularization for respiratory sound analysis","volume":"30","author":"Li","year":"2022","journal-title":"IEEE Transactions on Fuzzy Systems"},{"key":"10.1016\/j.eswa.2026.132951_bib0026","series-title":"2016 Sixth international conference on image processing theory, tools and applications (IPTA)","first-page":"1","article-title":"Combining deep learning and hand-crafted features for skin lesion classification","author":"Majtner","year":"2016"},{"key":"10.1016\/j.eswa.2026.132951_bib0027","series-title":"2023 IEEE workshop on applications of signal processing to audio and acoustics (WASPAA)","first-page":"1","article-title":"Pretraining respiratory sound representations using metadata and contrastive learning","author":"Moummad","year":"2023"},{"key":"10.1016\/j.eswa.2026.132951_bib0028","series-title":"2021 IEEE 6th international conference on computing, communication and automation (ICCCA)","first-page":"517","article-title":"Respiratory disease classification by CNN using MFCC","author":"Mridha","year":"2021"},{"issue":"2","key":"10.1016\/j.eswa.2026.132951_bib0029","doi-asserted-by":"crossref","first-page":"19","DOI":"10.1007\/s10916-020-01681-9","article-title":"Automatic lung health screening using respiratory sounds","volume":"45","author":"Mukherjee","year":"2021","journal-title":"Journal of Medical Systems"},{"issue":"9","key":"10.1016\/j.eswa.2026.132951_bib0030","doi-asserted-by":"crossref","first-page":"2872","DOI":"10.1109\/TBME.2022.3156293","article-title":"Lung sound classification using co-tuning and stochastic normalization","volume":"69","author":"Nguyen","year":"2022","journal-title":"IEEE Transactions on Biomedical Engineering"},{"key":"10.1016\/j.eswa.2026.132951_bib0031","series-title":"2019 IEEE 32nd international symposium on computer-based medical systems (CBMS)","first-page":"50","article-title":"Deep auscultation: Predicting respiratory anomalies and diseases via recurrent neural networks","author":"Perna","year":"2019"},{"key":"10.1016\/j.eswa.2026.132951_bib0032","series-title":"2022 44th annual international conference of the IEEE engineering in medicine & biology society (EMBC)","first-page":"4595","article-title":"An ensemble of deep learning frameworks for predicting respiratory anomalies","author":"Pham","year":"2022"},{"key":"10.1016\/j.eswa.2026.132951_bib0033","series-title":"Precision medicine powered by phealth and connected health: ICBHI 2017, Thessaloniki, Greece, 18\u201321 november 2017","first-page":"33","article-title":"A respiratory sound database for the development of automated classification","author":"Rocha","year":"2018"},{"key":"10.1016\/j.eswa.2026.132951_bib0034","doi-asserted-by":"crossref","unstructured":"Schneider, S., Baevski, A., Collobert, R., & Auli, M. (2019). WAV2VEC: Unsupervised pre-training for speech recognition. arXiv: 1904.05862.","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"10.1016\/j.eswa.2026.132951_bib0035","series-title":"ICASSP 2021-2021 IEEE international conference on acoustics, speech and signal processing (ICASSP)","first-page":"1275","article-title":"Contrastive embeddind learning method for respiratory sound classification","author":"Song","year":"2021"},{"key":"10.1016\/j.eswa.2026.132951_bib0036","series-title":"2023 8th international conference on signal and image processing (ICSIP)","first-page":"511","article-title":"Respiratory sound classification based on swin transformer","author":"Sun","year":"2023"},{"key":"10.1016\/j.eswa.2026.132951_bib0037","article-title":"Redt: A specialized transformer model for the respiratory phase and adventitious sound detection","volume":"025007","author":"Wang","year":"2025","journal-title":"Physiological Measurement"},{"issue":"6","key":"10.1016\/j.eswa.2026.132951_bib0038","doi-asserted-by":"crossref","first-page":"4749","DOI":"10.1109\/JIOT.2020.3028574","article-title":"Integrated multiple kernel learning for device-free localization in cluttered environments using spatiotemporal information","volume":"8","author":"Zhang","year":"2020","journal-title":"IEEE Internet of Things Journal"},{"issue":"24","key":"10.1016\/j.eswa.2026.132951_bib0039","doi-asserted-by":"crossref","first-page":"24587","DOI":"10.1109\/JIOT.2022.3192322","article-title":"Toward robust and accurate device-free localization in cluttered environments with commodity wifi devices","volume":"9","author":"Zhang","year":"2022","journal-title":"IEEE Internet of Things Journal"},{"key":"10.1016\/j.eswa.2026.132951_bib0040","series-title":"2022 IEEE biomedical circuits and systems conference (bioCAS)","first-page":"213","article-title":"Grand challenge on respiratory sound classification for SPRSound dataset","author":"Zhang","year":"2022"}],"container-title":["Expert Systems with Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417426018634?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417426018634?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T21:04:14Z","timestamp":1780434254000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0957417426018634"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":40,"alternative-id":["S0957417426018634"],"URL":"https:\/\/doi.org\/10.1016\/j.eswa.2026.132951","relation":{},"ISSN":["0957-4174"],"issn-type":[{"value":"0957-4174","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"TIAMO: Text-image-audio multimodal model for respiratory sound classification","name":"articletitle","label":"Article Title"},{"value":"Expert Systems with Applications","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.eswa.2026.132951","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"132951"}}