{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T03:07:18Z","timestamp":1773716838500,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Science and Technology Council","doi-asserted-by":"publisher","award":["111-2221-E-003-016-MY2,110-2634-F-002-050"],"award-info":[{"award-number":["111-2221-E-003-016-MY2,110-2634-F-002-050"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658026","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"934-942","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Self-Supervised Multi-Label Classification with Global Context and Local Attention"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-8469-9247","authenticated-orcid":false,"given":"Chun-Yen","family":"Chen","sequence":"first","affiliation":[{"name":"National Taiwan Normal University, Taipei, Taiwan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8665-7860","authenticated-orcid":false,"given":"Mei-Chen","family":"Yeh","sequence":"additional","affiliation":[{"name":"National Taiwan Normal University, Taipei, Taiwan"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Y. Li A. Oord and O. Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. In arXiv:1807.03749."},{"key":"e_1_3_2_1_2_1","unstructured":"M. Caron I. Misra J. Mairal P. Goyal P. Bojanowski and A. Joulin. 2020. Unsupervised learning of visual features by contrasting cluster assignments. In arXiv:2006.09882."},{"key":"e_1_3_2_1_3_1","volume-title":"International Conference on Machine Learning.","author":"Chen T.","unstructured":"T. Chen, S. Kornblith, M. Norouzi, and G. Hinton. 2020. A simple framework for contrastive learning of visual representations. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.167"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"e_1_3_2_1_6_1","volume-title":"Devise: A deep visual-semantic embedding model. In Neural Information Processing Systems.","author":"Frome A.","year":"2013","unstructured":"A. Frome, G. Corrado, J. Shlens, S. Bengio, J. Dean, M. Ranzato, and T. Mikolov. 2013. Devise: A deep visual-semantic embedding model. In Neural Information Processing Systems."},{"key":"e_1_3_2_1_7_1","volume-title":"IEEE International Conference on Computer Vision.","author":"Gansbeke W. V.","unstructured":"W. V. Gansbeke, S. Vandenhende, S. Georgoulis, and L. V. Gool. 2021. Unsupervised Semantic Segmentation by Contrasting Object Mask Proposals. In IEEE International Conference on Computer Vision."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3088605"},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Learning Representations.","author":"Gidaris S.","unstructured":"S. Gidaris, P. Singh, and N. Komodakis. 2018. Unsupervised representation learning by predicting image rotations. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_10_1","volume-title":"Zhaohan Daniel Guo, Mohammad Gheshlaghi Azar, and et al.","author":"Grill Jean-Bastien","year":"2020","unstructured":"Jean-Bastien Grill, Florian Strub, Florent Altche, Corentin \u00b4 Tallec, Pierre H Richemond, Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Daniel Guo, Mohammad Gheshlaghi Azar, and et al. 2020. Bootstrap your own latent: A new approach to self-supervised learning. In arXiv:2006.07733."},{"key":"e_1_3_2_1_11_1","volume-title":"IEEE International Conference on Computer Vision and Pattern Recognition.","author":"Guo H.","unstructured":"H. Guo, K. Zheng, X. Fan, H. Yu, and S. Wang. 2019. Visual attention consistency under image transforms for multi-label image classification. In IEEE International Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_12_1","volume-title":"Masked Autoencoders Are Scalable Vision Learners. In IEEE International Conference on Computer Vision and Pattern Recognition.","author":"He K.","unstructured":"K. He, X. Chen, S. Xie, Y. Li, P. Doll\u00b4, and R. Girshick. 2022. Masked Autoencoders Are Scalable Vision Learners. In IEEE International Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_13_1","volume-title":"IEEE International Conference on Computer Vision and Pattern Recognition.","author":"He K.","unstructured":"K. He, H. Fan, Y. Wu, S. Xie, and R. Girshick. 2020. Momentum contrast for unsupervised visual representation learning. In IEEE International Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_14_1","volume-title":"Self-Supervised Pyramid Representation Learning for Multi-Label Visual Analysis and Beyond. In IEEE\/CVF Winter Conference on Applications of Computer Vision.","author":"Hsieh C. Y.","unstructured":"C. Y. Hsieh, C. J. Chang, F. E. Yang, and Y. C. Frank Wang. 2023. Self-Supervised Pyramid Representation Learning for Multi-Label Visual Analysis and Beyond. In IEEE\/CVF Winter Conference on Applications of Computer Vision."},{"key":"e_1_3_2_1_15_1","volume-title":"IEEE International Conference on Computer Vision and Pattern Recognition.","author":"Huang L.","unstructured":"L. Huang, S. You, M. Zheng, F. Wang, C. Qian, and T. Yamasaki. 2022. Learning where to learn in cross-view self-supervised learning. In IEEE International Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_16_1","volume-title":"Efficient Visual Pretraining with Contrastive Detection. In IEEE International Conference on Computer Vision.","author":"H\u00e9naff O. J.","unstructured":"O. J. H\u00e9naff, S. Koppula, J. B. Alayrac, A. Oord, O. Vinyals, and J. Carreira. 2021. Efficient Visual Pretraining with Contrastive Detection. In IEEE International Conference on Computer Vision."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"O. J. H\u00e9naff S. Koppula E. Shelhamer D. Zoran A. Jaegle A. Zisserman J. Carreira and R. Arandjelovi\u00b4c. 2022. Object Discovery and Representation Networks. In arXiv:2203.08777.","DOI":"10.1007\/978-3-031-19812-0_8"},{"key":"e_1_3_2_1_18_1","volume-title":"Prototypical Contrastive Learning of Unsupervised Representations. In International Conference on Learning Representations.","author":"Li J.","unstructured":"J. Li, P. Zhou, C. Xiong, and S. Hoi. 2021. Prototypical Contrastive Learning of Unsupervised Representations. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_20_1","volume-title":"IEEE International Conference on Computer Vision and Pattern Recognition.","author":"Misra I.","unstructured":"I. Misra and L. van der Maaten. 2020. Self-supervised learning of pretext-invariant representations. In IEEE International Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_21_1","volume-title":"European Conference on Computer Vision.","author":"Noroozi M.","unstructured":"M. Noroozi and P. Favaro. 2016. Unsupervised learning of visual representations by solving jigsaw puzzles. In European Conference on Computer Vision."},{"key":"e_1_3_2_1_22_1","volume-title":"International Conference on Learning Representations.","author":"Norouzi M.","unstructured":"M. Norouzi, T. Mikolov, S. Bengio, Y. Singer, J. Shlens, A. Frome, G. S. Corrado, and J. Dean. 2014. Zero-Shot Learning by Convex Combination of Semantic Embeddings. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_23_1","unstructured":"S. Purushwalkam and A. Gupta. 2020. Demystifying contrastive self-supervised learning: Invariances augmentations and dataset biases. In Neural Information Processing Systems."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-011-5256-5"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"O. Russakovsky* J. Deng* H. Su J. Krause S. Satheesh S. Ma Z. Huang A. Karpathy A. Khosla M. Bernstein A. C. Berg and Li Fei-Fei. 2014. ImageNet Large Scale Visual Recognition Challenge. In arXiv:1409.0575.","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_26_1","volume-title":"IEEE International Conference on Computer Vision and Pattern Recognition.","author":"Selvaraju R. R.","unstructured":"R. R. Selvaraju, K. Desai, J. Johnson, and N. Naik. 2021. CASTing Your Model: Learning to Localize Improves Self-Supervised Representations. In IEEE International Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_27_1","volume-title":"European Conference on Machine Learning.","author":"Tsoumakas G.","unstructured":"G. Tsoumakas and I. Vlahavas. 2007. Random k-labelsets: An ensemble method for multilabel classification. In European Conference on Machine Learning."},{"key":"e_1_3_2_1_28_1","volume-title":"CNN-RNN: A Unified Framework for Multi-Label Image Classification. In IEEE International Conference on Computer Vision and Pattern Recognition.","author":"Wang J.","unstructured":"J. Wang, Y. Yang, J. Mao, Z. Huang, C. Huang, and W. Xu. 2016. CNN-RNN: A Unified Framework for Multi-Label Image Classification. In IEEE International Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_29_1","volume-title":"IEEE International Conference on Computer Vision and Pattern Recognition.","author":"Wang X.","unstructured":"X. Wang, R. Zhang, C. Shen, T. Kong, and L. Li. 2021. Dense contrastive learning for self-supervised visual pre-training. In IEEE International Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_30_1","unstructured":"F. Wei Y. Gao Z. Wu H. Hu and S. Lin. 2021. Aligning Pretraining for Detection via Object-Level Contrastive Learning. In Neural Information Processing Systems."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2491929"},{"key":"e_1_3_2_1_32_1","volume-title":"IEEE International Conference on Computer Vision.","author":"Xie E.","unstructured":"E. Xie, J. Ding, W. Wang, X. Zhan, H. Xu, P. Sun, Z. Li, and P. Luo. 2021. Detco: Unsupervised contrastive learning for object detection. In IEEE International Conference on Computer Vision."},{"key":"e_1_3_2_1_33_1","volume-title":"Instance Localization for Self-Supervised Detection Pretraining. In IEEE International Conference on Computer Vision and Pattern Recognition.","author":"Yang C.","unstructured":"C. Yang, Z. Wu, B. Zhou, and S. Lin. 2021a. Instance Localization for Self-Supervised Detection Pretraining. In IEEE International Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_34_1","volume-title":"IEEE International Conference on Computer Vision and Pattern Recognition.","author":"Yang C.","unstructured":"C. Yang, Z. Wu, B. Zhou, and S. Lin. 2021b. Instance localization for self-supervised detection pretraining. In IEEE International Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2911065"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2006.12.019"},{"key":"e_1_3_2_1_37_1","unstructured":"X. Zhang and M. Maire. 2020. Self-Supervised Visual Representation Learning from Hierarchical Grouping. In Neural Information Processing Systems."},{"key":"e_1_3_2_1_38_1","volume-title":"IEEE International Conference on Computer Vision and Pattern Recognition.","author":"Zhao Y.","unstructured":"Y. Zhao, G. Wang, C. Luo, W. Zeng, and Z. J. Zha. 2021. Self-supervised visual representations learning by contrastive mask prediction. In IEEE International Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_39_1","volume-title":"IEEE International Conference on Computer Vision and Pattern Recognition.","author":"Zhu F.","unstructured":"F. Zhu, H. Li, W. Ouyang, N. Yu, and X. Wang. 2017. Learning spatial regularization with imagelevel supervisions for multi-label image classification. In IEEE International Conference on Computer Vision and Pattern Recognition."}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658026","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658026","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:48:44Z","timestamp":1755766124000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658026"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":39,"alternative-id":["10.1145\/3652583.3658026","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658026","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}