{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:11Z","timestamp":1765339751344,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":22,"publisher":"ACM","funder":[{"name":"Vietnam National University - Ho Chi Minh City","award":["36-2024-44-02"],"award-info":[{"award-number":["36-2024-44-02"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3762093","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:54:15Z","timestamp":1761375255000},"page":"14197-14203","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-Level CLS Token Fusion for Contrastive Learning in Endoscopy Image Classification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-0021-5429","authenticated-orcid":false,"given":"Y Hop","family":"Nguyen","sequence":"first","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam and South Telecom JSC, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4512-0053","authenticated-orcid":false,"given":"Doan Anh","family":"Phan Huu","sequence":"additional","affiliation":[{"name":"South Telecom JSC, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1422-9685","authenticated-orcid":false,"given":"Trung Thai","family":"Tran","sequence":"additional","affiliation":[{"name":"South Telecom JSC, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3310-7562","authenticated-orcid":false,"given":"Nhat Nam","family":"Mai","sequence":"additional","affiliation":[{"name":"South Telecom JSC, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0219-1735","authenticated-orcid":false,"given":"Van Toi","family":"Giap","sequence":"additional","affiliation":[{"name":"South Telecom JSC, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0109-1114","authenticated-orcid":false,"given":"Thao Thi Phuong","family":"Dao","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam and Thong Nhat Hospital, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7363-2610","authenticated-orcid":false,"given":"Trung-Nghia","family":"Le","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diego Gonz\u00e1lez Mor\u00edn, and Lucas Rod\u00e9s-Guirao","author":"Baldassarre Federico","year":"2017","unstructured":"Federico Baldassarre, Diego Gonz\u00e1lez Mor\u00edn, and Lucas Rod\u00e9s-Guirao. 2017. Deep koalarization: Image colorization using cnns and inception-resnet-v2. arXiv preprint arXiv:1712.03400 (2017)."},{"key":"e_1_3_2_1_2_1","unstructured":"Krishna Chaitanya Naseer Karani Christian Baumgartner et al. 2020. Contrastive learning of global and local features for medical image segmentation with limited annotations. In NeurIPS."},{"key":"e_1_3_2_1_3_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"volume-title":"Deep Learning","author":"Goodfellow Ian","key":"e_1_3_2_1_4_1","unstructured":"Ian Goodfellow, Yoshua Bengio, and Aaron Courville. 2016. Deep Learning. MIT Press. https:\/\/www.deeplearningbook.org."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_6_1","unstructured":"Yu He Xiang Peng Jun Zhao et al. 2021. TransFG: A Transformer Architecture for Fine-grained Recognition. In AAAI."},{"key":"e_1_3_2_1_7_1","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al., 2022. Lora: Low-rank adaptation of large language models. ICLR, Vol. 1, 2 (2022), 3.","journal-title":"ICLR"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Xinyu Huang Yuhao Zhang Zhi Chen et al. 2021. GLoRIA: A Multimodal Global-Local Representation Learning for Medical Images and Reports. In CVPR.","DOI":"10.1109\/ICCV48922.2021.00391"},{"key":"e_1_3_2_1_9_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_10_1","volume-title":"Switzerland)","author":"Mukhtorov Doniyorjon","year":"2023","unstructured":"Doniyorjon Mukhtorov, Madinakhon Rakhmonova, Muksimova Shakhnoza, and Young Im Cho. 2023. Endoscopic Image Classification Based on Explainable Deep Learning. Sensors (Basel, Switzerland), Vol. 23 (2023). https:\/\/api.semanticscholar.org\/CorpusID:257620420"},{"key":"e_1_3_2_1_11_1","volume-title":"Ha Nguyen Thi, Tien To Vu Thuy, Uyen Hanh Tran, Tam V. Nguyen, Thanh Dinh Le, and Minh-Triet Tran.","author":"Nguyen Trong-Thuan","year":"2025","unstructured":"Trong-Thuan Nguyen, Viet-Tham Huynh, Thao Thi Phuong Dao, Ha Nguyen Thi, Tien To Vu Thuy, Uyen Hanh Tran, Tam V. Nguyen, Thanh Dinh Le, and Minh-Triet Tran. 2025. ACM Multimedia Grand Challenge on ENT Endoscopy Analysis. arXiv preprint arXiv2508.04801 (2025)."},{"key":"e_1_3_2_1_12_1","volume-title":"An introduction to convolutional neural networks. arXiv preprint arXiv:1511.08458","author":"O'shea Keiron","year":"2015","unstructured":"Keiron O'shea and Ryan Nash. 2015. An introduction to convolutional neural networks. arXiv preprint arXiv:1511.08458 (2015)."},{"key":"e_1_3_2_1_13_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_14_1","unstructured":"Yongming Rao Wenliang Zhao Jiwen Lu and Jie Zhou. 2022. TokenFusion: Facilitating Information Fusion in Token-Based Transformers for Multi-Modal Learning. In ECCV."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"e_1_3_2_1_16_1","volume-title":"Grad-CAM: Why did you say that? arXiv preprint arXiv:1611.07450","author":"Selvaraju Ramprasaath R","year":"2016","unstructured":"Ramprasaath R Selvaraju, Abhishek Das, Ramakrishna Vedantam, Michael Cogswell, Devi Parikh, and Dhruv Batra. 2016. Grad-CAM: Why did you say that? arXiv preprint arXiv:1611.07450 (2016)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Aliza Subedi Smriti Regmi Nisha Regmi Bhumi Bhusal Ulas Bagci and Debesh Jha. 2024. Classification of Endoscopy and Video Capsule Images using CNN-Transformer Model. In CaPTion@MICCAI. https:\/\/api.semanticscholar.org\/CorpusID:271909358","DOI":"10.1007\/978-3-031-73376-5_3"},{"key":"e_1_3_2_1_18_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00020"},{"key":"e_1_3_2_1_20_1","unstructured":"Yuhao Zhang Xiaoman Liu Chunyuan Tao et al. 2023. BioMedCLIP: Medical Vision-Language Pretraining with Biomedical Knowledge. arXiv preprint arXiv:2303.09044 (2023)."},{"key":"e_1_3_2_1_21_1","unstructured":"Yuhao Zhang Chunyuan Tao Shengjie Luan et al. 2022. Contrastive learning of medical visual representations from paired images and text. In NeurIPS."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3762093","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:04:11Z","timestamp":1765339451000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3762093"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":22,"alternative-id":["10.1145\/3746027.3762093","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3762093","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}