{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T01:21:43Z","timestamp":1772846503456,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Japan Society for the Promotion of Science KAKENHI","award":["JP21H03519"],"award-info":[{"award-number":["JP21H03519"]}]},{"name":"Japan Society for the Promotion of Science KAKENHI","award":["JP24H00733"],"award-info":[{"award-number":["JP24H00733"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3696409.3700211","type":"proceedings-article","created":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T09:55:23Z","timestamp":1735379723000},"page":"1-1","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Action Selection Learning for Multi-label Multi-view Action Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8976-2922","authenticated-orcid":false,"given":"Trung Thanh","family":"Nguyen","sequence":"first","affiliation":[{"name":"Nagoya Univeristy, Nagoya, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3799-4550","authenticated-orcid":false,"given":"Yasutomo","family":"Kawanishi","sequence":"additional","affiliation":[{"name":"RIKEN, Kyoto, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3041-4330","authenticated-orcid":false,"given":"Takahiro","family":"Komamizu","sequence":"additional","affiliation":[{"name":"Nagoya University, Nagoya, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3942-9296","authenticated-orcid":false,"given":"Ichiro","family":"Ide","sequence":"additional","affiliation":[{"name":"Nagoya University, Nagoya, Japan"}]}],"member":"320","published-online":{"date-parts":[[2024,12,28]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"Reem Alfaifi and Abdel\u00a0Monim Artoli. 2020. Human action prediction with 3D-CNN. SN Computer Science 1:286 5 (2020) 1\u201315.","DOI":"10.1007\/s42979-020-00293-x"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_3_2_4_2","unstructured":"Yue Bai Zhiqiang Tao Lichen Wang Sheng Li Yu Yin and Yun Fu. 2020. Collaborative attention mechanism for multi-view action recognition. Computing Research Repository arXiv Preprints arXiv:https:\/\/arXiv.org\/abs\/2009.06599 (2020)."},{"key":"e_1_3_3_2_5_2","first-page":"813","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is space-time attention all you need for video understanding?. In Proceedings of the 38th International Conference on Machine Learning. 813\u2013824."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Marc-Andr\u00e9 Carbonneau Veronika Cheplygina Eric Granger and Ghyslain Gagnon. 2018. Multiple instance learning: A survey of problem characteristics and applications. Pattern Recognition 77 (2018) 329\u2013353.","DOI":"10.1016\/j.patcog.2017.10.009"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00610"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/DSC61021.2023.10354225"},{"key":"e_1_3_3_2_9_2","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. Computing Research Repository arXiv Preprints arXiv:https:\/\/arXiv.org\/abs\/2010.11929 (2020)."},{"key":"e_1_3_3_2_10_2","first-page":"2286","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"d\u2019Ascoli St\u00e9phane","year":"2021","unstructured":"St\u00e9phane d\u2019Ascoli, Hugo Touvron, Matthew\u00a0L Leavitt, Ari\u00a0S Morcos, Giulio Biroli, and Levent Sagun. 2021. ConViT: Improving vision transformers with soft convolutional inductive biases. In Proceedings of the 38th International Conference on Machine Learning. 2286\u20132296."},{"key":"e_1_3_3_2_11_2","unstructured":"Ahmed\u00a0A Elngar Mohamed Arafa Amar Fathy Basma Moustafa Omar Mahmoud Mohamed Shaban and Nehal Fawzy. 2021. Image classification based on CNN: A survey. Journal of Cybersecurity and Information Management 6 1 (2021) 18\u201350."},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Pranay Gupta Anirudh Thatipelli Aditya Aggarwal Shubh Maheshwari Neel Trivedi Sourav Das and Ravi\u00a0Kiran Sarvadevabhatla. 2021. Quo vadis skeleton action recognition? International Journal of Computer Vision 129 7 (2021) 2097\u20132112.","DOI":"10.1007\/s11263-021-01470-y"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","unstructured":"Younggi Hong Min\u00a0Ju Kim Isack Lee and Seok\u00a0Bong Yoo. 2023. Fluxformer: Flow-Guided Duplex Attention Transformer via Spatio-Temporal Clustering for Action Recognition. IEEE Robotics and Automation Letters 8 10 (2023) 6411\u20136418. 10.1109\/LRA.2023.3307285","DOI":"10.1109\/LRA.2023.3307285"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Muhammad\u00a0Attique Khan Kashif Javed Sajid\u00a0Ali Khan Tanzila Saba Usman Habib Junaid\u00a0Ali Khan and Aaqif\u00a0Afzaal Abbasi. 2024. Human action recognition using fusion of multiview and deep features: An application to video surveillance. Multimedia Tools and Applications 83 5 (2024) 14885\u201314911.","DOI":"10.1007\/s11042-020-08806-9"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Jun-Hwa Kim and Chee\u00a0Sun Won. 2020. Action recognition in videos using pre-trained 2D convolutional neural networks. IEEE Access 8 (2020) 60179\u201360188.","DOI":"10.1109\/ACCESS.2020.2983427"},{"key":"e_1_3_3_2_18_2","unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. Computing Research Repository arXiv Preprints arXiv:https:\/\/arXiv.org\/abs\/1412.6980 (2014)."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00722"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Yu Kong and Yun Fu. 2022. Human action recognition and prediction: A survey. International Journal of Computer Vision 130 5 (2022) 1366\u20131401.","DOI":"10.1007\/s11263-022-01594-9"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Jun Liu Amir Shahroudy Mauricio Perez Gang Wang Ling-Yu Duan and Alex\u00a0C Kot. 2019. NTU RGB+D 120: A large-scale benchmark for 3D human activity understanding. IEEE Transactions on Pattern Analysis and Machine Intelligence 42 10 (2019) 2684\u20132701.","DOI":"10.1109\/TPAMI.2019.2916873"},{"key":"e_1_3_3_2_22_2","unstructured":"Shilong Liu Lei Zhang Xiao Yang Hang Su and Jun Zhu. 2021. Query2Label: A simple transformer way to multi-label classification. Computing Research Repository arXiv Preprints arXiv:https:\/\/arXiv.org\/abs\/2107.10834 (2021)."},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00750"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00086"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Adeshina\u00a0Sirajdin Olagoke Haidi Ibrahim and Soo\u00a0Siang Teoh. 2020. Literature survey on multi-camera system and its application. IEEE Access 8 (2020) 172892\u2013172922.","DOI":"10.1109\/ACCESS.2020.3024568"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Preksha Pareek and Ankit Thakkar. 2021. A survey on video-based human action recognition: Recent updates datasets challenges and applications. Artificial Intelligence Review 54 3 (2021) 2259\u20132322.","DOI":"10.1007\/s10462-020-09904-8"},{"key":"e_1_3_3_2_27_2","first-page":"8748","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning transferable visual models from natural language supervision. In Proceedings of the 38th International Conference on Machine Learning. 8748\u20138763."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_39"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-23808-6_10"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00338"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.115"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Zhongwei Shen Xiao-Jun Wu and Josef Kittler. 2021. 2D progressive fusion module for action recognition. Image and Vision Computing 109 (2021) 1\u201310.","DOI":"10.1016\/j.imavis.2021.104122"},{"key":"e_1_3_3_2_33_2","first-page":"38","volume-title":"Proceedings of the 15th Asian Conference on Computer Vision","volume":"5","author":"Shi Lei","year":"2020","unstructured":"Lei Shi, Yifan Zhang, Jian Cheng, and Hanqing Lu. 2020. Decoupled spatial-temporal attention network for skeleton-based action-gesture recognition. In Proceedings of the 15th Asian Conference on Computer Vision , Vol.\u00a05. 38\u201353."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01316"},{"key":"e_1_3_3_2_35_2","unstructured":"Zehua Sun Qiuhong Ke Hossein Rahmani Mohammed Bennamoun Gang Wang and Jun Liu. 2022. Human action recognition from various data modalities: A review. IEEE Transactions on Pattern Analysis and Machine Intelligence 45 3 (2022) 3200\u20133225."},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.5555\/3304889.3305036"},{"key":"e_1_3_3_2_37_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in Neural Information Processing Systems 30 (2017) 6000\u20136010."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"crossref","unstructured":"Viacheslav Voronin Marina Zhdanova Evgenii Semenishchev Aleksander Zelenskii Yigang Cen and Sos Agaian. 2021. Action recognition for the robotics and manufacturing automation using 3-D binary micro-block difference. International Journal of Advanced Manufacturing Technology 117 (2021) 2319\u20132330.","DOI":"10.1007\/s00170-021-07613-2"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58583-9_26"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCCS55155.2022.9846526"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"crossref","unstructured":"Qian Wang and Ke Chen. 2020. Multi-label zero-shot human action recognition via joint latent ranking embedding. Neural Networks 122 (2020) 1\u201323.","DOI":"10.1016\/j.neunet.2019.09.029"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Qiang Wang Gan Sun Jiahua Dong Qianqian Wang and Zhengming Ding. 2021. Continuous multi-view human action recognition. IEEE Transactions on Circuits and Systems for Video Technology 32 6 (2021) 3603\u20133614.","DOI":"10.1109\/TCSVT.2021.3112214"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Hanbo Wu Xin Ma and Yibin Li. 2021. Spatiotemporal multimodal learning with 3D CNNs for video action recognition. IEEE Transactions on Circuits and Systems for Video Technology 32 3 (2021) 1250\u20131261.","DOI":"10.1109\/TCSVT.2021.3077512"},{"key":"e_1_3_3_2_44_2","unstructured":"Masahiro Yasuda Noboru Harada Yasunori Ohishi Shoichiro Saito Akira Nakayama and Nobutaka Ono. 2024. Guided Masked Self-Distillation Modeling for Distributed Multimedia Sensor Event Analysis. Computing Research Repository arXiv Preprints arXiv:https:\/\/arXiv.org\/abs\/2404.08264 (2024)."},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746006"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"crossref","unstructured":"Serena Yeung Olga Russakovsky Ning Jin Mykhaylo Andriluka Greg Mori and Li Fei-Fei. 2018. Every moment counts: Dense detailed labeling of actions in complex videos. International Journal of Computer Vision 126 (2018) 375\u2013389.","DOI":"10.1007\/s11263-017-1013-y"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547980"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"crossref","unstructured":"Pengfei Zhang Cuiling Lan Junliang Xing Wenjun Zeng Jianru Xue and Nanning Zheng. 2019. View adaptive neural networks for high performance skeleton-based human action recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence 41 8 (2019) 1963\u20131978.","DOI":"10.1109\/TPAMI.2019.2896631"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01439"},{"key":"e_1_3_3_2_50_2","unstructured":"Zhilu Zhang and Mert Sabuncu. 2018. Generalized cross entropy loss for training deep neural networks with noisy labels. Advances in Neural Information Processing Systems 31 (2018) 8792\u20138802."},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"crossref","unstructured":"Dawei Zhao Qingwei Gao Yixiang Lu and Dong Sun. 2023. Non-Aligned Multi-View Multi-Label Classification via Learning View-Specific Labels. IEEE Transactions on Multimedia 25 (2023) 7235\u20137247.","DOI":"10.1109\/TMM.2022.3219650"}],"event":{"name":"MMAsia '24: ACM Multimedia Asia","location":"Auckland New Zealand","acronym":"MMAsia '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 6th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700211","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696409.3700211","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:15Z","timestamp":1750295415000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700211"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":50,"alternative-id":["10.1145\/3696409.3700211","10.1145\/3696409"],"URL":"https:\/\/doi.org\/10.1145\/3696409.3700211","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}