{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:10:41Z","timestamp":1772907041635,"version":"3.50.1"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031263156","type":"print"},{"value":"9783031263163","type":"electronic"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-26316-3_35","type":"book-chapter","created":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T08:02:32Z","timestamp":1677657752000},"page":"588-604","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":22,"title":["gScoreCAM: What Objects Is CLIP Looking At?"],"prefix":"10.1007","author":[{"given":"Peijie","family":"Chen","sequence":"first","affiliation":[]},{"given":"Qi","family":"Li","sequence":"additional","affiliation":[]},{"given":"Saad","family":"Biaz","sequence":"additional","affiliation":[]},{"given":"Trung","family":"Bui","sequence":"additional","affiliation":[]},{"given":"Anh","family":"Nguyen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,3,2]]},"reference":[{"key":"35_CR1","unstructured":"Bommasani, R., et al.: On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258 (2021)"},{"key":"35_CR2","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"35_CR3","unstructured":"yzhuoning: yzhuoning\/awesome-clip: Awesome list for research on clip (contrastive language-image pre-training) (2022). https:\/\/github.com\/yzhuoning\/Awesome-CLIP. Accessed 18 May 2022"},{"key":"35_CR4","doi-asserted-by":"crossref","unstructured":"Patashnik, O., Wu, Z., Shechtman, E., Cohen-Or, D., Lischinski, D.: Styleclip: text-driven manipulation of stylegan imagery. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2085\u20132094 (2021)","DOI":"10.1109\/ICCV48922.2021.00209"},{"key":"35_CR5","unstructured":"nerdyrodent: nerdyrodent\/vqgan-clip: Just playing with getting vqgan+clip running locally, rather than having to use colab (2022). https:\/\/github.com\/nerdyrodent\/VQGAN-CLIP. Accessed 18 May 2022"},{"key":"35_CR6","unstructured":"Kim, G., Ye, J.C.: Diffusionclip: text-guided image manipulation using diffusion models. arXiv preprint arXiv:2110.02711 (2021)"},{"key":"35_CR7","doi-asserted-by":"crossref","unstructured":"Luo, H., et al.: Clip4clip: an empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860 (2021)","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"35_CR8","doi-asserted-by":"crossref","unstructured":"Lei, J., et al.: Less is more: clipbert for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7331\u20137341 (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"35_CR9","doi-asserted-by":"crossref","unstructured":"Song, H., Dong, L., Zhang, W.N., Liu, T., Wei, F.: Clip models are few-shot learners: empirical studies on VQA and visual entailment. arXiv preprint arXiv:2203.07190 (2022)","DOI":"10.18653\/v1\/2022.acl-long.421"},{"key":"35_CR10","doi-asserted-by":"crossref","unstructured":"Kwon, G., Ye, J.C.: Clipstyler: image style transfer with a single text condition. arXiv preprint arXiv:2112.00374 (2021)","DOI":"10.1109\/CVPR52688.2022.01753"},{"key":"35_CR11","doi-asserted-by":"crossref","unstructured":"Vinker, Y., et al.: Clipasso: semantically-aware object sketching. arXiv preprint arXiv:2202.05822 (2022)","DOI":"10.1145\/3528223.3530068"},{"key":"35_CR12","doi-asserted-by":"crossref","unstructured":"Sheng, E., Chang, K.W., Natarajan, P., Peng, N.: The woman worked as a babysitter: on biases in language generation. arXiv preprint arXiv:1909.01326 (2019)","DOI":"10.18653\/v1\/D19-1339"},{"key":"35_CR13","unstructured":"Verge, T.: What a machine learning tool that turns obama white can (and can\u2019t) tell us about ai bias - the verge (2022). www.theverge.com\/21298762\/face-depixelizer-ai-machine-learning-tool-pulse-stylegan-obama-bias. Accessed 19 May 2022"},{"key":"35_CR14","doi-asserted-by":"crossref","unstructured":"Li, Q., Mai, L., Alcorn, M.A., Nguyen, A.: A cost-effective method for improving and re-purposing large, pre-trained GANs by fine-tuning their class-embeddings. In: Proceedings of the Asian Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-69538-5_32"},{"key":"35_CR15","doi-asserted-by":"crossref","unstructured":"Phillips, P.J., Hahn, C.A., Fontana, P.C., Broniatowski, D.A., Przybocki, M.A.: Four principles of explainable artificial intelligence. Gaithersburg, Maryland (2020)","DOI":"10.6028\/NIST.IR.8312-draft"},{"key":"35_CR16","doi-asserted-by":"publisher","DOI":"10.23915\/distill.00030","volume":"6","author":"G Goh","year":"2021","unstructured":"Goh, G., et al.: Multimodal neurons in artificial neural networks. Distill 6, e30 (2021)","journal-title":"Distill"},{"key":"35_CR17","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Generic attention-model explainability for interpreting bi-modal and encoder-decoder transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 397\u2013406 (2021)","DOI":"10.1109\/ICCV48922.2021.00045"},{"key":"35_CR18","doi-asserted-by":"crossref","unstructured":"Subramanian, S., Merrill, W., Darrell, T., Gardner, M., Singh, S., Rohrbach, A.: Reclip: a strong zero-shot baseline for referring expression comprehension. arXiv preprint arXiv:2204.05991 (2022)","DOI":"10.18653\/v1\/2022.acl-long.357"},{"key":"35_CR19","doi-asserted-by":"crossref","unstructured":"Aflalo, E., et al.: VL-interpret: an interactive visualization tool for interpreting vision-language transformers. arXiv preprint arXiv:2203.17247 (2022)","DOI":"10.1109\/CVPR52688.2022.02072"},{"key":"35_CR20","unstructured":"vijishmadhavan: vijishmadhavan\/crop-clip: Crop using clip (2022). https:\/\/github.com\/vijishmadhavan\/Crop-CLIP. Accessed 23 May 2022"},{"key":"35_CR21","unstructured":"Kim, W., Son, B., Kim, I.: ViLT: vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning, pp. 5583\u20135594. PMLR (2021)"},{"key":"35_CR22","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-cam: visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"35_CR23","doi-asserted-by":"crossref","unstructured":"Chattopadhay, A., Sarkar, A., Howlader, P., Balasubramanian, V.N.: Grad-CAM++: generalized gradient-based visual explanations for deep convolutional networks. In: 2018 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 839\u2013847. IEEE (2018)","DOI":"10.1109\/WACV.2018.00097"},{"key":"35_CR24","unstructured":"Petsiuk, V., Das, A., Saenko, K.: Rise: randomized input sampling for explanation of black-box models. arXiv preprint arXiv:1806.07421 (2018)"},{"key":"35_CR25","unstructured":"Nguyen, A., Yosinski, J., Clune, J.: Multifaceted feature visualization: uncovering the different types of features learned by each neuron in deep neural networks. arXiv preprint arXiv:1602.03616 (2016)"},{"key":"35_CR26","doi-asserted-by":"crossref","unstructured":"Materzy\u0144ska, J., Torralba, A., Bau, D.: Disentangling visual and written concepts in clip. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16410\u201316419 (2022)","DOI":"10.1109\/CVPR52688.2022.01592"},{"key":"35_CR27","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: Score-CAM: score-weighted visual explanations for convolutional neural networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp. 24\u201325 (2020)","DOI":"10.1109\/CVPRW50498.2020.00020"},{"key":"35_CR28","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vision 115, 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vision"},{"key":"35_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"35_CR30","unstructured":"He, J., et al.: Partimagenet: a large, high-quality dataset of parts. arXiv preprint arXiv:2112.00933 (2021)"},{"key":"35_CR31","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Transformer interpretability beyond attention visualization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 782\u2013791 (2021)","DOI":"10.1109\/CVPR46437.2021.00084"},{"key":"35_CR32","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"35_CR33","doi-asserted-by":"crossref","unstructured":"Ribeiro, M.T., Singh, S., Guestrin, C.: \u201cWhy should i trust you?\u201d explaining the predictions of any classifier. In: Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1135\u20131144 (2016)","DOI":"10.1145\/2939672.2939778"},{"key":"35_CR34","doi-asserted-by":"crossref","unstructured":"Agarwal, C., Nguyen, A.: Explaining image classifiers by removing input features using generative models. In: Proceedings of the Asian Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-69544-6_7"},{"key":"35_CR35","unstructured":"Simonyan, K., Vedaldi, A., Zisserman, A.: Deep inside convolutional networks: visualising image classification models and saliency maps. arXiv preprint arXiv:1312.6034 (2013)"},{"key":"35_CR36","unstructured":"Nourelahi, M., Kotthoff, L., Chen, P., Nguyen, A.: How explainable are adversarially-robust CNNs? arXiv preprint arXiv:2205.13042 (2022)"},{"key":"35_CR37","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"35_CR38","unstructured":"Recht, B., Roelofs, R., Schmidt, L., Shankar, V.: Do imagenet classifiers generalize to imagenet? In: International Conference on Machine Learning, pp. 5389\u20135400. PMLR (2019)"},{"key":"35_CR39","doi-asserted-by":"crossref","unstructured":"Choe, J., Oh, S.J., Lee, S., Chun, S., Akata, Z., Shim, H.: Evaluating weakly supervised object localization methods right. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3133\u20133142 (2020)","DOI":"10.1109\/CVPR42600.2020.00320"},{"key":"35_CR40","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"752","DOI":"10.1007\/978-3-030-58580-8_44","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Gupta","year":"2020","unstructured":"Gupta, T., Vahdat, A., Chechik, G., Yang, X., Kautz, J., Hoiem, D.: Contrastive learning for weakly supervised phrase grounding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 752\u2013768. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_44"},{"key":"35_CR41","unstructured":"Gildenblat, J., contributors: Pytorch library for cam methods (2021). https:\/\/github.com\/jacobgil\/pytorch-grad-cam"},{"key":"35_CR42","unstructured":"OpenAI: openai\/clip: Contrastive language-image pretraining (2022). https:\/\/github.com\/openai\/CLIP. Accessed 06 July 2022"},{"key":"35_CR43","doi-asserted-by":"publisher","first-page":"259","DOI":"10.1016\/0167-2789(92)90242-F","volume":"60","author":"L Radin","year":"1992","unstructured":"Radin, L., Osher, S., Fatemi, E.: Non-linear total variation noise removal algorithm. Phys. D 60, 259\u2013268 (1992)","journal-title":"Phys. D"},{"key":"35_CR44","unstructured":"Fu, R., Hu, Q., Dong, X., Guo, Y., Gao, Y., Li, B.: Axiom-based grad-CAM: towards accurate visualization and explanation of CNNs. arXiv preprint arXiv:2008.02312 (2020)"},{"key":"35_CR45","doi-asserted-by":"publisher","first-page":"5875","DOI":"10.1109\/TIP.2021.3089943","volume":"30","author":"PT Jiang","year":"2021","unstructured":"Jiang, P.T., Zhang, C.B., Hou, Q., Cheng, M.M., Wei, Y.: Layercam: exploring hierarchical class activation maps for localization. IEEE Trans. Image Process. 30, 5875\u20135888 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"35_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Rao, L., Yang, Y.: Group-CAM: group score-weighted visual explanations for deep convolutional networks. arXiv preprint arXiv:2103.13859 (2021)","DOI":"10.1109\/CVPRW50498.2020.00020"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-26316-3_35","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,15]],"date-time":"2024-10-15T12:48:59Z","timestamp":1728996539000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-26316-3_35"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031263156","9783031263163"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-26316-3_35","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"2 March 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Macao","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 December 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.accv2022.org","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT Microsoft","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"836","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"277","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"33% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.6","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"For the ACCV 2022 workshops 25 papers have been accepted from 40 submissions","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}