{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T08:05:00Z","timestamp":1761897900039},"reference-count":48,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,18]],"date-time":"2023-06-18T00:00:00Z","timestamp":1687046400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,18]],"date-time":"2023-06-18T00:00:00Z","timestamp":1687046400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6,18]]},"DOI":"10.1109\/ijcnn54540.2023.10192036","type":"proceedings-article","created":{"date-parts":[[2023,8,2]],"date-time":"2023-08-02T17:30:03Z","timestamp":1690997403000},"page":"1-8","source":"Crossref","is-referenced-by-count":10,"title":["Multi-channel Attentive Weighting of Visual Frames for Multimodal Video Classification"],"prefix":"10.1109","author":[{"given":"Yuqing","family":"Wang","sequence":"first","affiliation":[{"name":"School of Software, Shandong University,Jinan,China"}]},{"given":"Zhuang","family":"Qi","sequence":"additional","affiliation":[{"name":"School of Software, Shandong University,Jinan,China"}]},{"given":"Xiangxian","family":"Li","sequence":"additional","affiliation":[{"name":"School of Software, Shandong University,Jinan,China"}]},{"given":"Jinxing","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Software, Shandong University,Jinan,China"}]},{"given":"Xiangxu","family":"Meng","sequence":"additional","affiliation":[{"name":"School of Software, Shandong University,Jinan,China"}]},{"given":"Lei","family":"Meng","sequence":"additional","affiliation":[{"name":"School of Software, Shandong University,Jinan,China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020","journal-title":"arXiv preprint"},{"key":"ref2","article-title":"Vilt: Vision-and-language transformer without convolution or region supervision","author":"Kim","year":"2021","journal-title":"ICML"},{"key":"ref3","article-title":"Learnable pooling with context gating for video classification","author":"Miech","year":"2017","journal-title":"arXiv preprint"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2924576"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964297"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICWS.2019.00071"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-24797-2_4"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01467"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-02985-2"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-25075-0_36"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2015.2498625"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ijcnn.2012.6252397"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20497-5_47"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413598"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3475724.3483600"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350870"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3126973.3129306"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3475724.3483601"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN48605.2020.9206851"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3460426.3463606"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2022.10.083"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3512527.3531370"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3003648"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/2671188.2749362"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2013.47"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20497-5_34"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611973440.92"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2019.08.020"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i03.5651"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-35749-7_17"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.119"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2017.2778011"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.294"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"article-title":"Efficientnet: Rethinking model scaling for con-volutional neural networks","volume-title":"International conference on machine learning","author":"Tan","key":"ref39"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-11018-5_19"},{"key":"ref41","article-title":"Two-stream convolutional networks for action recognition in videos","author":"Simonyan","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12319"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4012"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2729019"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.03.091"},{"article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proceedings of ICML","author":"Radford","key":"ref46"},{"article-title":"Multi-modal Trans-former for Video Retrieval","volume-title":"Proceedings of ECCV","author":"Gabeur","key":"ref47"},{"key":"ref48","article-title":"Visualizing data using t-sne","author":"Van der Maaten","year":"2008","journal-title":"Journal of machine learning research"}],"event":{"name":"2023 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2023,6,18]]},"location":"Gold Coast, Australia","end":{"date-parts":[[2023,6,23]]}},"container-title":["2023 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10190990\/10190992\/10192036.pdf?arnumber=10192036","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T15:31:23Z","timestamp":1709307083000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10192036\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,18]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/ijcnn54540.2023.10192036","relation":{},"subject":[],"published":{"date-parts":[[2023,6,18]]}}}