{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T17:40:05Z","timestamp":1756489205042,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":23,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,9,20]],"date-time":"2023-09-20T00:00:00Z","timestamp":1695168000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,9,20]]},"DOI":"10.1145\/3617233.3617248","type":"proceedings-article","created":{"date-parts":[[2023,12,30]],"date-time":"2023-12-30T06:05:32Z","timestamp":1703916332000},"page":"238-242","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Deep Learning Based Multimodal with Two-phase Training Strategy for Daily Life Video Classification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8155-7553","authenticated-orcid":false,"given":"Lam","family":"Pham","sequence":"first","affiliation":[{"name":"Austrian Institute of Technology, Austria"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9148-2848","authenticated-orcid":false,"given":"Trang","family":"Le","sequence":"additional","affiliation":[{"name":"JVN Institute-VNU, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8883-4677","authenticated-orcid":false,"given":"Cam","family":"Le","sequence":"additional","affiliation":[{"name":"HCM University of Technology, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4374-5056","authenticated-orcid":false,"given":"Dat","family":"Ngo","sequence":"additional","affiliation":[{"name":"University of Essex, UK"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7246-2744","authenticated-orcid":false,"given":"Axel","family":"Weissenfeld","sequence":"additional","affiliation":[{"name":"AIT Austrian Institute of Technology GmbH, Austria"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4881-6741","authenticated-orcid":false,"given":"Alexander","family":"Schindler","sequence":"additional","affiliation":[{"name":"Austrian Institute of Technology, Austria"}]}],"member":"320","published-online":{"date-parts":[[2023,12,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Audio-Visual Scene Classification Using A Transfer Learning Based Joint Optimization Strategy. arXiv preprint arXiv:2204.11420","author":"Chen Chengxin","year":"2022","unstructured":"Chengxin Chen, Meng Wang, and Pengyuan Zhang. 2022. Audio-Visual Scene Classification Using A Transfer Learning Based Joint Optimization Strategy. arXiv preprint arXiv:2204.11420 (2022)."},{"key":"e_1_3_2_1_2_1","unstructured":"Fran\u00e7ois Chollet 2015. Keras. https:\/\/keras.io."},{"key":"e_1_3_2_1_3_1","volume-title":"Lip Reading Sentences in the Wild. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 3444\u20133453","author":"Chung Joon\u00a0Son","year":"2017","unstructured":"Joon\u00a0Son Chung, A. Senior, Oriol Vinyals, and Andrew Zisserman. 2017. Lip Reading Sentences in the Wild. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 3444\u20133453."},{"key":"e_1_3_2_1_4_1","volume-title":"Classification of Acoustic\u00a0Scenes, and Events Community","author":"Detection","year":"2021","unstructured":"Detection, Classification of Acoustic\u00a0Scenes, and Events Community. 2021. DCASE Challenges Task 1A. http:\/\/dcase.community\/challenge2021."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP55362.2022.9949447"},{"key":"e_1_3_2_1_7_1","volume-title":"Kingma and Jimmy Ba","author":"P.","year":"2015","unstructured":"Diederik\u00a0P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. CoRR abs\/1412.6980 (2015)."},{"key":"e_1_3_2_1_8_1","volume-title":"Proc. DCASE. 16\u201320","author":"Naranjo-Alcazar Javier","year":"2021","unstructured":"Javier Naranjo-Alcazar, Sergi Perez-Castanos, Aaron Lopez-Garcia, Pedro Zuccarello, Maximo Cobos, and Francesc\u00a0J Ferri. 2021. Squeeze-excitation convolutional recurrent neural networks for audio-visual scene classification. In Proc. DCASE. 16\u201320."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.47839\/ijc.21.2.2595"},{"key":"e_1_3_2_1_10_1","volume-title":"Proc. DCASE. 95\u201399","author":"Okazaki Soichiro","year":"2021","unstructured":"Soichiro Okazaki, Quan Kong, and Tomoaki Yoshinaga. 2021. A Multi-Modal Fusion Approach for Audio-Visual Scene Classification Enhanced by CLIP Variants.. In Proc. DCASE. 95\u201399."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.dsp.2020.102943"},{"key":"e_1_3_2_1_12_1","volume-title":"arXiv preprint arXiv:2210.08610","author":"Pham Lam","year":"2022","unstructured":"Lam Pham, Dusan Salovic, Anahid Jalali, Alexander Schindler, Khoa Tran, Canh Vu, and Phu\u00a0X Nguyen. 2022. Robust, General, and Low Complexity Acoustic Scene Classification Systems and An Effective Visualization for Presenting a Sound Scene Context. arXiv preprint arXiv:2210.08610 (2022)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3551626.3564962"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414551"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS45731.2020.9181210"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2751969"},{"key":"e_1_3_2_1_18_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Tokozume Yuji","year":"2018","unstructured":"Yuji Tokozume, Yoshitaka Ushiku, and Tatsuya Harada. 2018. Learning from between-class examples for deep sound recognition. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP57327.2022.10038206"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9415085"},{"key":"e_1_3_2_1_21_1","volume-title":"InceptionNeXt: When Inception Meets ConvNeXt. arXiv preprint arXiv:2303.16900","author":"Yu Weihao","year":"2023","unstructured":"Weihao Yu, Pan Zhou, Shuicheng Yan, and Xinchao Wang. 2023. InceptionNeXt: When Inception Meets ConvNeXt. arXiv preprint arXiv:2303.16900 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2017.2719043"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIM.2023.3260282"}],"event":{"name":"CBMI 2023: 20th International Conference on Content-based Multimedia Indexing","acronym":"CBMI 2023","location":"Orleans France"},"container-title":["20th International Conference on Content-based Multimedia Indexing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3617233.3617248","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3617233.3617248","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T16:59:47Z","timestamp":1756486787000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3617233.3617248"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9,20]]},"references-count":23,"alternative-id":["10.1145\/3617233.3617248","10.1145\/3617233"],"URL":"https:\/\/doi.org\/10.1145\/3617233.3617248","relation":{},"subject":[],"published":{"date-parts":[[2023,9,20]]},"assertion":[{"value":"2023-12-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}