{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,8]],"date-time":"2026-07-08T16:58:33Z","timestamp":1783529913179,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foudnation of China (NSFC)","award":["U1936205"],"award-info":[{"award-number":["U1936205"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612523","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"4421-4430","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Zero-Shot Object Detection by Semantics-Aware DETR with Adaptive Contrastive Loss"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2569-3736","authenticated-orcid":false,"given":"Huan","family":"Liu","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9532-5219","authenticated-orcid":false,"given":"Lu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2313-7635","authenticated-orcid":false,"given":"Jihong","family":"Guan","sequence":"additional","affiliation":[{"name":"Tongji University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1949-2768","authenticated-orcid":false,"given":"Shuigeng","family":"Zhou","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Zero-Shot Object Detection. European Conference on Computer Vision (ECCV)","author":"Bansal Ankan","year":"2018","unstructured":"Ankan Bansal, Karan Sikka, Gaurav Sharma, Rama Chellappa, and Ajay Divakaran. 2018. Zero-Shot Object Detection. European Conference on Computer Vision (ECCV) (2018)."},{"key":"e_1_3_2_1_2_1","volume-title":"Boult","author":"Bendale Abhijit","year":"2016","unstructured":"Abhijit Bendale and Terrance E. Boult. 2016. Towards Open Set Deep Networks. In Computer Vision and Pattern Recognition (CVPR). IEEE, 1563--1572."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Zhaowei Cai and Nuno Vasconcelos. 2018. Cascade R-CNN: Delving Into High Quality Object Detection.. In Computer Vision and Pattern Recognition (CVPR). 6154--6162.","DOI":"10.1109\/CVPR.2018.00644"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Soravit Changpinyo Wei-Lun Chao Boqing Gong and Fei Sha. 2016. Synthesized Classifiers for Zero-Shot Learning. In Computer Vision and Pattern Recognition (CVPR). 5327--5336.","DOI":"10.1109\/CVPR.2016.575"},{"key":"e_1_3_2_1_6_1","volume-title":"International Conference on Machine Learning (ICML). 1597--1607","author":"Chen Ting","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey E. Hinton. 2020. A Simple Framework for Contrastive Learning of Visual Representations.. In International Conference on Machine Learning (ICML). 1597--1607."},{"key":"e_1_3_2_1_7_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS). 379--387","author":"Dai Jifeng","year":"2016","unstructured":"Jifeng Dai, Yi Li, Kaiming He, and Jian Sun. 2016. R-FCN: Object Detection via Region-based Fully Convolutional Networks.. In Conference on Neural Information Processing Systems (NeurIPS). 379--387."},{"key":"e_1_3_2_1_8_1","volume-title":"British Machine Vision Conference (BMVC). 56","author":"Demirel Berkan","year":"2018","unstructured":"Berkan Demirel, Ramazan Gokberk Cinbis, and Nazli Ikizler-Cinbis. 2018. Zero-Shot Object Detection by Hybrid Region Embedding.. In British Machine Vision Conference (BMVC). 56."},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale.. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"e_1_3_2_1_11_1","volume-title":"Forsyth","author":"Farhadi Ali","year":"2009","unstructured":"Ali Farhadi, Ian Endres, Derek Hoiem, and David A. Forsyth. 2009. Describing objects by their attributes. In Computer Vision and Pattern Recognition (CVPR). 1778--1785."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2408354"},{"key":"e_1_3_2_1_13_1","unstructured":"Michael Gutmann and Aapo Hyv\u00e4rinen. 2010. Noise-contrastive estimation: A new estimation principle for unnormalized statistical models. In Journal of Machine Learning Research (JMLR). 297--304."},{"key":"e_1_3_2_1_14_1","volume-title":"Synthesizing the Unseen for Zero-Shot Object Detection. In Asian Conference on Computer Vision (ACCV). 155--170","author":"Hayat Nasir","year":"2020","unstructured":"Nasir Hayat, Munawar Hayat, Shafin Rahman, Salman H. Khan, Syed Waqas Zamir, and Fahad Shahbaz Khan. 2020. Synthesizing the Unseen for Zero-Shot Object Detection. In Asian Conference on Computer Vision (ACCV). 155--170."},{"key":"e_1_3_2_1_15_1","volume-title":"Momentum Contrast for Unsupervised Visual Representation Learning. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 9726--9735","author":"He Kaiming","year":"2020","unstructured":"Kaiming He, Haoqi Fan, Yuxin Wu, Saining Xie, and Ross Girshick. 2020. Momentum Contrast for Unsupervised Visual Representation Learning. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 9726--9735."},{"key":"e_1_3_2_1_16_1","volume-title":"IEEE International Conference on Computer Vision (ICCV). 2980--2988","author":"He Kaiming","unstructured":"Kaiming He, Georgia Gkioxari, Piotr Doll\u00e1r, and Ross B. Girshick. 2017. Mask R-CNN.. In IEEE International Conference on Computer Vision (ICCV). 2980--2988."},{"key":"e_1_3_2_1_17_1","volume-title":"Deep Residual Learning for Image Recognition. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 770--778","author":"He Kaiming","year":"2016","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 770--778."},{"key":"e_1_3_2_1_18_1","volume-title":"Robust Region Feature Synthesizer for Zero-Shot Object Detection. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7622--7631","author":"Huang Peiliang","year":"2022","unstructured":"Peiliang Huang, Junwei Han, De Cheng, and Dingwen Zhang. 2022. Robust Region Feature Synthesizer for Zero-Shot Object Detection. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7622--7631."},{"key":"e_1_3_2_1_19_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS).","author":"Khosla Prannay","year":"2020","unstructured":"Prannay Khosla, Piotr Teterwak, Chen Wang, Aaron Sarna, Yonglong Tian, Phillip Isola, Aaron Maschinot, Ce Liu, and Dilip Krishnan. 2020. Supervised Contrastive Learning.. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_20_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Diederik","unstructured":"Diederik P. Kingma and Max Welling. 2014. Auto-Encoding Variational Bayes.. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Elyor Kodirov Tao Xiang and Shaogang Gong. 2017. Semantic Autoencoder for Zero-Shot Learning. In Computer Vision and Pattern Recognition (CVPR). 4447--4456.","DOI":"10.1109\/CVPR.2017.473"},{"key":"e_1_3_2_1_22_1","volume-title":"The Hungarian Method for the Assignment Problem","author":"Kuhn Harold W.","unstructured":"Harold W. Kuhn. 2010. The Hungarian Method for the Assignment Problem. Springer Berlin Heidelberg. 29--47 pages."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206594"},{"key":"e_1_3_2_1_24_1","volume-title":"Computer Vision and Pattern Recognition (CVPR)","author":"Li Yanan","unstructured":"Yanan Li, Donghui Wang, Huanhang Hu, Yuetan Lin, and Yueting Zhuang. 2017. Zero-Shot Recognition Using Dual Visual-Semantic Mapping Paths. In Computer Vision and Pattern Recognition (CVPR). IEEE, 5207--5215."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018690"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2858826"},{"key":"e_1_3_2_1_27_1","volume-title":"European Conference on Computer Vision (ECCV). 740--755","author":"Lin Tsung-Yi","unstructured":"Tsung-Yi Lin, Michael Maire, Serge J. Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context.. In European Conference on Computer Vision (ECCV). 740--755."},{"key":"e_1_3_2_1_28_1","volume-title":"Berg","author":"Liu Wei","year":"2016","unstructured":"Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, and Alexander C. Berg. 2016. SSD: Single Shot MultiBox Detector. 21--37 pages."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_30_1","volume-title":"Decoupled Weight Decay Regularization. In International Conference on Learning Representations (ICLR).","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_31_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"Maaten Laurens","year":"2008","unstructured":"Laurens Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_32_1","unstructured":"Tomas Mikolov Kai Chen G. Corrado and J. Dean. 2013. Efficient Estimation of Word Representations in Vector Space. ICLR (2013)."},{"key":"e_1_3_2_1_33_1","volume-title":"Computer Vision and Pattern Recognition (CVPR) Workshop. 2188--2196","author":"Mishra Ashish","unstructured":"Ashish Mishra, M. Shiva Krishna Reddy, Anurag Mittal, and Hema A. Murthy. 2018. A Generative Model for Zero Shot Learning Using Conditional Variational Autoencoders. In Computer Vision and Pattern Recognition (CVPR) Workshop. 2188--2196."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00171"},{"key":"e_1_3_2_1_35_1","volume-title":"Representation learning with contrastive predictive coding. arXiv","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv (2018)."},{"key":"e_1_3_2_1_36_1","volume-title":"Conference on Empirical Methods in Natural Language Processing (EMNLP). 1532--1543","author":"Pennington Jeffrey","unstructured":"Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. Glove: Global Vectors for Word Representation.. In Conference on Empirical Methods in Natural Language Processing (EMNLP). 1532--1543."},{"key":"e_1_3_2_1_37_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning (ICML). 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, A. Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning (ICML). 8748--8763."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.5555\/1756006.1953015"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2861573"},{"key":"e_1_3_2_1_40_1","volume-title":"Zero-Shot Object Detection: Learning to Simultaneously Recognize and Localize Novel Concepts","author":"Rahman Shafin","unstructured":"Shafin Rahman, Salman Khan, and Fatih Porikli. 2019. Zero-Shot Object Detection: Learning to Simultaneously Recognize and Localize Novel Concepts. Springer International Publishing. 547--563 pages."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6868"},{"key":"e_1_3_2_1_42_1","volume-title":"Ross B. Girshick, and Ali Farhadi.","author":"Redmon Joseph","year":"2016","unstructured":"Joseph Redmon, Santosh Kumar Divvala, Ross B. Girshick, and Ali Farhadi. 2016. You Only Look Once: Unified, Real-Time Object Detection.. In Computer Vision and Pattern Recognition (CVPR). 779--788."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Hamid Rezatofighi Nathan Tsoi JunYoung Gwak Amir Sadeghian Ian D. Reid and Silvio Savarese. 2019. Generalized Intersection Over Union: A Metric and a Loss for Bounding Box Regression.. In Computer Vision and Pattern Recognition (CVPR). 658--666.","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_1_45_1","volume-title":"International Conference on Machine Learning (ICML). 2152--2161","author":"Romera-Paredes Bernardino","unstructured":"Bernardino Romera-Paredes and Philip H. S. Torr. 2015. An embarrassingly simple approach to zero-shot learning.. In International Conference on Machine Learning (ICML). 2152--2161."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Olaf Ronneberger Philipp Fischer and Thomas Brox. 2015. U-Net: Convolutional Networks for Biomedical Image Segmentation. 234--241 pages.","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_2_1_47_1","volume-title":"British Machine Vision Conference (BMVC). 347","author":"Sarma Sandipan","year":"2022","unstructured":"Sandipan Sarma, Sushil Kumar, and Arijit Sur. 2022. Resolving Semantic Confusions for Improved Zero-Shot Detection.. In British Machine Vision Conference (BMVC). 347."},{"key":"e_1_3_2_1_48_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS). 5998--6008","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need.. In Conference on Neural Information Processing Systems (NeurIPS). 5998--6008."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Yongqin Xian Bernt Schiele and Zeynep Akata. 2017. Zero-Shot Learning - The Good the Bad and the Ugly.. In Computer Vision and Pattern Recognition (CVPR). 3077--3086.","DOI":"10.1109\/CVPR.2017.328"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Yongqin Xian Saurabh Sharma Bernt Schiele and Zeynep Akata. 2019. F-VAEGAN-D2: A Feature Generating Framework for Any-Shot Learning.. In Computer Vision and Pattern Recognition (CVPR). 10275--10284.","DOI":"10.1109\/CVPR.2019.01052"},{"key":"e_1_3_2_1_51_1","volume-title":"Semantics-Guided Contrastive Network for Zero-Shot Object detection","author":"Yan Caixia","year":"2021","unstructured":"Caixia Yan, Xiaojun Chang, Minnan Luo, Huan Liu, Xiaoqin Zhang, and Qinghua Zheng. 2021. Semantics-Guided Contrastive Network for Zero-Shot Object detection. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI) (2021)."},{"key":"e_1_3_2_1_52_1","volume-title":"International Conference on Machine Learning (ICML). 7292--7303","author":"Zablocki Eloi","year":"2019","unstructured":"Eloi Zablocki, Patrick Bordes, Laure Soulier, Benjamin Piwowarski, and Patrick Gallinari. 2019. Context-Aware Zero-Shot Learning for Object Recognition.. In International Conference on Machine Learning (ICML). 7292--7303."},{"key":"e_1_3_2_1_53_1","volume-title":"DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection. ICLR 2023 poster","volume":"2203","author":"Zhang Hao","year":"2023","unstructured":"Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel Ni, and Harry Shum. 2023. DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection. ICLR 2023 poster, Vol. abs\/2203.03605 (2023)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.474"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Ziming Zhang and Venkatesh Saligrama. 2016. Zero-Shot Learning via Joint Latent Similarity Embedding. In Computer Vision and Pattern Recognition (CVPR). 6034--6042.","DOI":"10.1109\/CVPR.2016.649"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6996"},{"key":"e_1_3_2_1_57_1","volume-title":"Background Learnable Cascade for Zero-Shot Object Detection","author":"Zheng Ye","unstructured":"Ye Zheng, Ruoran Huang, Chuanqi Han, Xi Huang, and Li Cui. 2021b. Background Learnable Cascade for Zero-Shot Object Detection. Springer International Publishing. 107--123 pages."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475668"},{"key":"e_1_3_2_1_59_1","unstructured":"Pengkai Zhu Hanxiao Wang and Venkatesh Saligrama. 2020. Don't Even Look Once: Synthesizing Features for Zero-Shot Detection. In Computer Vision and Pattern Recognition (CVPR)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612523","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612523","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:55:17Z","timestamp":1755820517000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612523"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":59,"alternative-id":["10.1145\/3581783.3612523","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612523","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}