{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T05:18:31Z","timestamp":1781587111766,"version":"3.54.5"},"reference-count":150,"publisher":"Tsinghua University Press","issue":"1","license":[{"start":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T00:00:00Z","timestamp":1646092800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2021,10,27]],"date-time":"2021-10-27T00:00:00Z","timestamp":1635292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Comp. Visual. Med."],"published-print":{"date-parts":[[2022,3]]},"DOI":"10.1007\/s41095-021-0247-3","type":"journal-article","created":{"date-parts":[[2021,10,27]],"date-time":"2021-10-27T12:03:16Z","timestamp":1635336196000},"page":"33-62","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":127,"title":["Transformers in computational visual media: A survey"],"prefix":"10.26599","volume":"8","author":[{"given":"Yifan","family":"Xu","sequence":"first","affiliation":[{"name":"NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing 100190, China; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing 100040, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Huapeng","family":"Wei","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Jilin University, Changchun 130012, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Minxuan","family":"Lin","sequence":"additional","affiliation":[{"name":"NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing 100190, China; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing 100040, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yingying","family":"Deng","sequence":"additional","affiliation":[{"name":"NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing 100190, China; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing 100040, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kekai","family":"Sheng","sequence":"additional","affiliation":[{"name":"Youtu Lab, Tencent Inc., Shanghai 200233, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mengdan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Youtu Lab, Tencent Inc., Shanghai 200233, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fan","family":"Tang","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Jilin University, Changchun 130012, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Weiming","family":"Dong","sequence":"additional","affiliation":[{"name":"NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing 100190, China; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing 100040, China; CASIA-LLVISION Joint Lab, Beijing 100190, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Feiyue","family":"Huang","sequence":"additional","affiliation":[{"name":"Youtu Lab, Tencent Inc., Shanghai 200233, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Changsheng","family":"Xu","sequence":"additional","affiliation":[{"name":"NLPR, Institute of Automation, Chinese Academy of Sciences, Beijing 100190, China; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing 100040, China; CASIA-LLVISION Joint Lab, Beijing 100190, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"11138","reference":[{"key":"247_CR1","doi-asserted-by":"crossref","unstructured":"He, K. M.; Zhang, X. Y.; Ren, S. Q.; Sun, J. Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 770\u2013778, 2016.","DOI":"10.1109\/CVPR.2016.90"},{"key":"247_CR2","unstructured":"Tan, M.; Le, Q. EfficientNet: Rethinking model scaling for convolutional neural networks. In: Proceedings of the 36th International Conference on Machine Learning, 2019."},{"key":"247_CR3","doi-asserted-by":"crossref","unstructured":"Radosavovic, I.; Kosaraju, R. P.; Girshick, R.; He, K. M.; Doll\u00e1r, P. Designing network design spaces. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 10425\u201310433, 2020.","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"247_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"191","DOI":"10.1007\/978-3-030-58555-6_12","volume-title":"Computer Vision-ECCV 2020","author":"M H Yin","year":"2020","unstructured":"Yin, M. H.; Yao, Z. L.; Cao, Y.; Li, X.; Zhang, Z.; Lin, S.; Hu, H. Disentangled non-local neural networks. In: Computer Vision-ECCV 2020. Lecture Notes in Computer Science, Vol. 12360. Vedaldi, A.; Bischof, H.; Brox, T.; Frahm, J. M. Eds. Springer Cham, 191\u2013207, 2020."},{"key":"247_CR5","doi-asserted-by":"crossref","unstructured":"Hu, H.; Gu, J. Y.; Zhang, Z.; Dai, J. F.; Wei, Y. C. Relation networks for object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 3588\u20133597, 2018.","DOI":"10.1109\/CVPR.2018.00378"},{"key":"247_CR6","doi-asserted-by":"crossref","unstructured":"Wang, X. L.; Girshick, R.; Gupta, A.; He, K. M. Non-local neural networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 7794\u20137803, 2018.","DOI":"10.1109\/CVPR.2018.00813"},{"key":"247_CR7","doi-asserted-by":"crossref","unstructured":"Hu, H.; Zhang, Z.; Xie, Z. D.; Lin, S. Local relation networks for image recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 3463\u20133472, 2019.","DOI":"10.1109\/ICCV.2019.00356"},{"key":"247_CR8","unstructured":"Yuan, Y.; Huang, L.; Guo, J.; Zhang, C.; Chen, X.; Wang, J. OCNet: Object context network for scene parsing. arXiv preprint arXiv:1809.00916, 2018."},{"key":"247_CR9","unstructured":"Dosovitskiy, A.; Beyer, L.; Kolesnikov, A.; Weissenborn, D.; Zhai, X.; Unterthiner, T.; Dehghani, M.; Minderer, M.; Heigold, G.; Gelly, S.; Uszkoreit, J.; Houlsby, N. An image is worth 16\u00d716 words: Transformers for image recognition at scale. In: Proceedings of the International Conference on Learning Representations, 2021."},{"key":"247_CR10","unstructured":"Devlin, J.; Chang, M.-W.; Lee, K.; Toutanova, K. BERT: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, 4171\u20134186, 2019."},{"key":"247_CR11","unstructured":"Chen, M.; Radford, A.; Child, R.; Wu, J.; Jun, H.; Luan, D.; Sutskever, I. Generative pretraining from pixels. In: Proceedings of the 37th International Conference on Machine Learning, 1691\u20131703, 2020."},{"key":"247_CR12","unstructured":"Graham, B.; El-Nouby, A.; Touvron, H.; Stock, P.; Joulin, A.; J\u00e9gou, H.; Douze, M. LeViT: A vision transformer in ConvNet\u2019s clothing for faster inference. arXiv preprint arXiv:2104.01136, 2021."},{"key":"247_CR13","unstructured":"Tay, Y.; Dehghani, M.; Bahri, D.; Metzler, D. Efficient transformers: A survey. arXiv preprint arXiv:2009.06732, 2020."},{"key":"247_CR14","unstructured":"Liang, J.; Hu, D.; He, R.; Feng, J. Distill and fine-tune: Effective adaptation from a black-box source model. arXiv preprint arXiv:2104.01539, 2021."},{"key":"247_CR15","unstructured":"Yuan, L.; Chen, Y.; Wang, T.; Yu, W.; Shi, Y.; Tay, F. E.; Feng, J.; Yan, S. Tokens-to-Token ViT: Training vision transformers from scratch on ImageNet. arXiv preprint arXiv:2101.11986, 2021."},{"key":"247_CR16","unstructured":"Han, K.; Xiao, A.; Wu, E.; Guo, J.; Xu, C.; Wang, Y. Transformer in transformer. arXiv preprint arXiv:2103.00112, 2021."},{"key":"247_CR17","unstructured":"Chu, X. X.; Tian, Z.; Zhang, B.; Wang, X. L.; Wei, X. L.; Xia, H. X.; Shen, C. Conditional positional encodings for vision transformers. arXiv preprint arXiv:2102.10882, 2021."},{"key":"247_CR18","unstructured":"D\u2019Ascoli, S.; Touvron, H.; Leavitt, M. L.; Morcos, A. S.; Biroli, G.; Sagun, L. ConViT: Improving vision transformers with soft convolutional inductive biases. In: Proceedings of the 38th International Conference on Machine Learning, 2286\u20132296, 2021."},{"key":"247_CR19","unstructured":"Zhou, D.; Kang, B.; Jin, X.; Yang, L.; Lian, X.; Hou, Q.; Feng, J. DeepViT: Towards deeper vision transformer. arXiv preprint arXiv:2103.11886, 2021."},{"key":"247_CR20","unstructured":"Liu, Z.; Lin, Y. T.; Cao, Y.; Hu, H.; Guo, B. N. Swin transformer: Hierarchical vision transformer using shifted windows. arXiv preprint arXiv:2103.14030, 2021."},{"key":"247_CR21","unstructured":"Heo, B.; Yun, S.; Han, D.; Chun, S.; Oh, S. J. Rethinking spatial dimensions of vision transformers. arXiv preprint arXiv:2103.16302, 2021."},{"key":"247_CR22","unstructured":"Li, Y. W.; Zhang, K.; Cao, J. Z.; Timofte, R.; Gool, L. V. LocalViT: Bringing locality to vision transformers. arXiv preprint arXiv:2104.05707, 2021."},{"key":"247_CR23","doi-asserted-by":"crossref","unstructured":"Chefer, H.; Gur, S.; Wolf, L. Transformer interpretability beyond attention visualization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 782\u2013791, 2021.","DOI":"10.1109\/CVPR46437.2021.00084"},{"key":"247_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision-ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N.; Massa, F.; Synnaeve, G.; Usunier, N.; Kirillov, A.; Zagoruyko, S. End-to-end object detection with transformers. In: Computer Vision-ECCV 2020. Lecture Notes in Computer Science, Vol. 12346. Vedaldi, A.; Bischof, H.; Brox, T.; Frahm, J. M. Eds. Springer Cham, 213\u2013229, 2020."},{"key":"247_CR25","unstructured":"Zhu, X. Z.; Su, W. J.; Lu, L. W.; Li, B.; Dai, J. F. Deformable DETR: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159, 2020."},{"key":"247_CR26","unstructured":"Dai, Z. G.; Cai, B. L.; Lin, Y. G.; Chen, J. Y. UP-DETR: Unsupervised pre-training for object detection with transformers. arXiv preprint arXiv:2011.09094, 2020."},{"key":"247_CR27","unstructured":"Wang, W.; Xie, E.; Li, X.; Fan, D.-P.; Song, K.; Liang, D.; Lu, T.; Luo, P.; Shao. L. Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. arXiv preprint arXiv:2102.12122, 2021."},{"key":"247_CR28","doi-asserted-by":"crossref","unstructured":"Wang, Y.; Xu, Z.; Wang, X.; Shen, C.; Cheng, B.; Shen, H.; Xia, H. End-to-end video instance segmentation with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 8741\u20138750, 2021.","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"247_CR29","unstructured":"Xie, E.; Wang, W.; Yu, Z.; Anandkumar, A.; Alvarez, J. M.; Luo, P. SegFormer: Simple and efficient design for semantic segmentation with transformers. arXiv preprint arXiv:2105.15203, 2021."},{"key":"247_CR30","unstructured":"Kumar, M.; Weissenborn, D.; Kalchbrenner, N. Colorization transformer. In: Proceedings of the 9th International Conference on Learning Representations, 2021."},{"key":"247_CR31","unstructured":"Liu, B. C.; Song, K. P.; Zhu, Y. Z.; de Melo, G.; Elgammal, A. TIME: Text and image mutual-translation adversarial networks. In: Proceedings of the 35th AAAI Conference on Artificial Intelligence, 2082\u20132090, 2021."},{"key":"247_CR32","unstructured":"Ramesh, A.; Pavlov, M.; Goh, G.; Gray, S.; Voss, C.; Radford, A.; Chen, M.; Sutskever, I. Zero-shot text-to-image generation. arXiv preprint arXiv:2102.12092, 2021."},{"key":"247_CR33","doi-asserted-by":"crossref","unstructured":"Yang, F. Z.; Yang, H.; Fu, J. L.; Lu, H. T.; Guo, B. N. Learning texture transformer network for image super-resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 5790\u20135799, 2020.","DOI":"10.1109\/CVPR42600.2020.00583"},{"key":"247_CR34","unstructured":"Jiang, Y. F.; Chang, S. Y.; Wang, Z. Y. TransGAN: Two transformers can make one strong GAN. arXiv preprint arXiv:2102.07074, 2021."},{"key":"247_CR35","unstructured":"Hudson, D. A.; Zitnick, C. L. Generative adversarial transformers. arXiv preprint arXiv:2103.01209, 2021."},{"key":"247_CR36","unstructured":"Van den Oord, A.; Vinyals, O.; Kavukcuoglu, K. Neural discrete representation learning. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, 6309\u20136318, 2017."},{"key":"247_CR37","unstructured":"Wang, Z.; Cun, X.; Bao, J.; Liu, J. Uformer: A general U-shaped transformer for image restoration. arXiv preprint arXiv:2106.03106, 2021."},{"key":"247_CR38","unstructured":"Deng, Y. Y.; Tang, F.; Pan, X. J.; Dong, W. M.; Xu, C. S. StyTr2: Unbiased image style transfer with transformers. arXiv preprint arXiv:2105.14576, 2021."},{"issue":"2","key":"247_CR39","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1007\/s41095-021-0229-5","volume":"7","author":"M-H Guo","year":"2021","unstructured":"Guo, M.-H.; Cai, J.-X.; Liu, Z.-N.; Mu, T.-J.; Martin, R. R.; Hu, S.-M. PCT: Point cloud transformer. Computational Visual Media Vol. 7, No. 2, 187\u2013199, 2021.","journal-title":"Computational Visual Media"},{"key":"247_CR40","unstructured":"Lu, J.; Batra, D.; Parikh, D.; Lee, S. ViLBERT: Pre-training task-agnostic visiolinguistic representations for vision-and-language tasks. In: Proceedings of the 33rd Conference on Neural Information Processing Systems, 13\u201323, 2019."},{"key":"247_CR41","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision-ECCV 2020","author":"Y-C Chen","year":"2020","unstructured":"Chen, Y.-C.; Li, L. J.; Yu, L. C.; El Kholy, A.; Ahmed, F.; Gan, Z.; Cheng, Y.; Liu, J. UNITER: UNiversal image-TExt representation learning. In: Computer Vision-ECCV 2020. Lecture Notes in Computer Science, Vol. 12375. Vedaldi, A.; Bischof, H.; Brox, T.; Frahm, J. M. Eds. Springer Cham, 104\u2013120, 2020."},{"key":"247_CR42","unstructured":"Li, C. L.; Yan, M.; Xu, H. Y.; Luo, F. L.; Huang, S. F. SemVLP: Vision-language pre-training by aligning semantics at multiple levels. arXiv preprint arXiv:2103.07829, 2021."},{"key":"247_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"649","DOI":"10.1007\/978-3-319-46487-9_40","volume-title":"Computer Vision-ECCV 2016","author":"R Zhang","year":"2016","unstructured":"Zhang, R.; Isola, P.; Efros, A. A. Colorful image colorization. In: Computer Vision-ECCV 2016. Lecture Notes in Computer Science, Vol. 9907. Leibe, B.; Matas, J.; Sebe, N.; Welling, M. Eds. Springer Cham, 649\u2013666, 2016."},{"key":"247_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, R.; Zhu, J.-Y.; Isola, P.; Geng, X. Y.; Lin, A. S.; Yu, T. H.; Efros, A. A. Real-time user-guided image colorization with learned deep priors. arXiv preprint arXiv:1705.02999, 2017.","DOI":"10.1145\/3072959.3073703"},{"key":"247_CR45","doi-asserted-by":"crossref","unstructured":"Su, J.-W.; Chu, H.-K.; Huang, J.-B. Instance-aware image colorization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 7965\u20137974, 2020.","DOI":"10.1109\/CVPR42600.2020.00799"},{"key":"247_CR46","doi-asserted-by":"crossref","unstructured":"Pang, L.; Lan, Y.; Guo, J.; Xu, J.; Wan, S.; Cheng, X. Text matching as image recognition. In: Proceedings of the 30th AAAI Conference on Artificial Intelligence, 2793\u20132799, 2016.","DOI":"10.1609\/aaai.v30i1.10341"},{"issue":"2","key":"247_CR47","doi-asserted-by":"publisher","first-page":"295","DOI":"10.1109\/TPAMI.2015.2439281","volume":"38","author":"C Dong","year":"2016","unstructured":"Dong, C.; Loy, C. C.; He, K. M.; Tang, X. O. Image super-resolution using deep convolutional networks. IEEE Transactions on Pattern Analysis and Machine Intelligence Vol. 38, No. 2, 295\u2013307, 2016.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"247_CR48","doi-asserted-by":"crossref","unstructured":"Zhang, Y. L.; Tian, Y. P.; Kong, Y.; Zhong, B. N.; Fu, Y. Residual dense network for image super-resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2472\u20132481, 2018.","DOI":"10.1109\/CVPR.2018.00262"},{"key":"247_CR49","doi-asserted-by":"crossref","unstructured":"Haris, M.; Shakhnarovich, G.; Ukita, N. Deep back-projection networks for super-resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 1664\u20131673, 2018.","DOI":"10.1109\/CVPR.2018.00179"},{"key":"247_CR50","unstructured":"Chen, X.; Duan, Y.; Houthooft, R.; Schulman, J.; Sutskever, I.; Abbeel, P. InfoGAN: Interpretable representation learning by information maximizing generative adversarial nets. arXiv preprint arXiv:1606.03657, 2016."},{"key":"247_CR51","unstructured":"Salimans, T.; Goodfellow, I.; Zaremba, W.; Cheung, V.; Radford, A.; Chen, X. Improved techniques for training GANs. arXiv preprint arXiv:1606.03498, 2016."},{"key":"247_CR52","unstructured":"Heusel, M.; Ramsauer, H.; Unterthiner, T.; Nessler, B.; Hochreiter, S. GANs trained by a two time-scale update rule converge to a local Nash equilibrium. arXiv preprint arXiv:1706.08500, 2017."},{"key":"247_CR53","doi-asserted-by":"crossref","unstructured":"Karras, T.; Laine, S.; Aila, T. M. A style-based generator architecture for generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 4396\u20134405, 2019.","DOI":"10.1109\/CVPR.2019.00453"},{"key":"247_CR54","unstructured":"Gulrajani, I.; Ahmed, F.; Arjovsky, M.; Dumoulin, V.; Courville, A. Improved training of wasserstein GANs. arXiv preprint arXiv:1704.00028, 2017."},{"issue":"4","key":"247_CR55","doi-asserted-by":"publisher","first-page":"27","DOI":"10.1109\/45.329294","volume":"13","author":"G Bebis","year":"1994","unstructured":"Bebis, G.; Georgiopoulos, M. Feed-forward neural networks. IEEE Potentials Vol. 13, No. 4, 27\u201331, 1994.","journal-title":"IEEE Potentials"},{"key":"247_CR56","unstructured":"Ba, J. L.; Kiros, J. R.; Hinton, G. E. Layer normalization. arXiv preprint arXiv:1607.06450, 2016."},{"key":"247_CR57","unstructured":"Vaswani, A.; Shazeer, N.; Parmar, N.; Uszkoreit, J.; Jones, L.; Gomez, A. N.; Kaiser, L.; Polosukhin, I. Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, 6000\u20136010, 2017."},{"key":"247_CR58","unstructured":"Hendrycks, D.; Gimpel, K. Gaussian error linear units (GELUs). arXiv preprint arXiv:1606.08415, 2016."},{"key":"247_CR59","unstructured":"Kitaev, N.; Kaiser, L.; Levskaya, A. Reformer: The efficient transformer. In: Proceedings of the International Conference on Learning Representations, 2020."},{"key":"247_CR60","unstructured":"Choromanski, K. M.; Likhosherstov, V.; Dohan, D.; Song, X.; Gane, A.; Sarlos, T.; Hawkins, P.; Davis, J. Q.; Mohiuddin, A.; Kaiser, L. et al. Rethinking attention with performers. In: Proceedings of the International Conference on Learning Representations, 2021."},{"key":"247_CR61","unstructured":"Wang, S.; Li, B.; Khabsa, M.; Fang, H.; Ma, H. Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768, 2020."},{"key":"247_CR62","doi-asserted-by":"crossref","unstructured":"Abnar, S.; Zuidema, W. Quantifying attention flow in transformers. arXiv preprint arXiv:2005.00928, 2020.","DOI":"10.18653\/v1\/2020.acl-main.385"},{"key":"247_CR63","doi-asserted-by":"crossref","unstructured":"Voita, E.; Talbot, D.; Moiseev, F.; Sennrich, R.; Titov, I. Analyzing multi-head self-attention: Specialized heads do the heavy lifting, the rest can be pruned. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, 5797\u20135808, 2019.","DOI":"10.18653\/v1\/P19-1580"},{"issue":"3","key":"247_CR64","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O.; Deng, J.; Su, H.; Krause, J.; Satheesh, S.; Ma, S. A.; Huang, Z.; Karpathy, A.; Khosla, A.; Bernstein, M. et al. ImageNet large scale visual recognition challenge. International Journal of Computer Vision Vol. 115, No. 3, 211\u2013252, 2015.","journal-title":"International Journal of Computer Vision"},{"key":"247_CR65","unstructured":"Touvron, H.; Cord, M.; Douze, M.; Massa, F.; Sablayrolles, A.; Jegou, H. Training data-efficient image transformers & distillation through attention. In: Proceedings of the 38th International Conference on Machine Learning, 10347\u201310357, 2021."},{"key":"247_CR66","unstructured":"Han, Y. Z.; Huang, G.; Song, S. J.; Yang, L.; Wang, Y. L. Dynamic neural networks: A survey. arXiv preprint arXiv:2102.04906, 2021."},{"key":"247_CR67","unstructured":"Xu, W.; Xu, Y.; Chang, T.; Tu, Z. Co-scale conv-attentional image transformers. arXiv preprint arXiv:2104.06399, 2021."},{"key":"247_CR68","unstructured":"Dong, X. Y.; Bao, J. M.; Chen, D. D.; Zhang, W. M.; Yu, N. H.; Yuan, L.; Chen, D.; Guo, B. CSWin transformer: A general vision transformer backbone with cross-shaped windows. arXiv preprint arXiv:2107.00652, 2021."},{"key":"247_CR69","doi-asserted-by":"crossref","unstructured":"Huang, Z. L.; Wang, X. G.; Huang, L. C.; Huang, C.; Wei, Y. C.; Liu, W. CCNet: Criss-cross attention for semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 603\u2013612, 2019.","DOI":"10.1109\/ICCV.2019.00069"},{"key":"247_CR70","doi-asserted-by":"crossref","unstructured":"Hou, Q. B.; Zhang, L.; Cheng, M. M.; Feng, J. S. Strip pooling: Rethinking spatial pooling for scene parsing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 4002\u20134011, 2020.","DOI":"10.1109\/CVPR42600.2020.00406"},{"key":"247_CR71","unstructured":"Touvron, H.; Cord, M.; Sablayrolles, A.; Synnaeve, G.; Jegou, H. Going deeper with image transformers. arXiv preprint arXiv:2103.17239, 2021."},{"key":"247_CR72","doi-asserted-by":"crossref","unstructured":"Selvaraju, R. R.; Cogswell, M.; Das, A.; Vedantam, R.; Parikh, D.; Batra, D. Grad-CAM: Visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE International Conference on Computer Vision, 618\u2013626, 2017.","DOI":"10.1109\/ICCV.2017.74"},{"key":"247_CR73","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"63","DOI":"10.1007\/978-3-319-44781-0_8","volume-title":"Artificial Neural Networks and Machine Learning-ICANN 2016","author":"A Binder","year":"2016","unstructured":"Binder, A.; Montavon, G.; Lapuschkin, S.; M\u00fcller, K.-R.; Samek, W. Layer-wise relevance propagation for neural networks with local renormalization layers. In: Artificial Neural Networks and Machine Learning-ICANN 2016. Lecture Notes in Computer Science, Vol. 9887. Villa, A.; Masulli, P.; Pons Rivero, A. Eds. Springer Cham, 63\u201371, 2016."},{"key":"247_CR74","doi-asserted-by":"crossref","unstructured":"Zheng, S.; Lu, J.; Zhao, H.; Zhu, X.; Luo, Z.; Wang, Y.; Fu, Y.; Feng, J.; Xiang, T.; Torr, P. H. et al. Rethinking semantic segmentation from a sequence-tosequence perspective with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 6881\u20136890, 2021.","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"247_CR75","unstructured":"Duke, B.; Ahmed, A.; Wolf, C.; Aarabi, P.; Taylor, G. W. SSTVOS: Sparse spatiotemporal transformers for video object segmentation. arXiv preprint arXiv:2101.08833, 2021."},{"key":"247_CR76","unstructured":"Chen, J. N.; Lu, Y. Y.; Yu, Q. H.; Luo, X. D.; Zhou, Y. Y. TransUNet: Transformers make strong encoders for medical image segmentation. arXiv preprint arXiv:2102.04306, 2021."},{"key":"247_CR77","doi-asserted-by":"crossref","unstructured":"Ye, L. W.; Rochan, M.; Liu, Z.; Wang, Y. Cross-modal self-attention network for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 10494\u201310503, 2019.","DOI":"10.1109\/CVPR.2019.01075"},{"key":"247_CR78","unstructured":"Wang, H.; Zhu, Y.; Adam, H.; Yuille, A.; Chen, L.-C. Max-deeplab: End-to-end panoptic segmentation with mask transformers. arXiv preprint arXiv:2012.00759, 2020."},{"key":"247_CR79","unstructured":"Durner, M.; Boerdijk, W.; Sundermeyer, M.; Friedl, W.; Marton, Z.-C.; Triebel, R. Unknown object segmentation from stereo images. arXiv preprint arXiv:2103.06796, 2021."},{"issue":"1","key":"247_CR80","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1109\/TPAMI.2019.2929257","volume":"43","author":"Z Cao","year":"2021","unstructured":"Cao, Z.; Hidalgo, G.; Simon, T.; Wei, S.-E.; Sheikh, Y. OpenPose: Realtime multi-person 2D pose estimation using part affinity fields. IEEE Transactions on Pattern Analysis and Machine Intelligence Vol. 43, No. 1, 172\u2013186, 2021.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"247_CR81","doi-asserted-by":"crossref","unstructured":"Simon, T.; Joo, H.; Matthews, I.; Sheikh, Y. Hand keypoint detection in single images using multiview bootstrapping. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 4645\u20134653, 2017.","DOI":"10.1109\/CVPR.2017.494"},{"key":"247_CR82","doi-asserted-by":"crossref","unstructured":"Cao, Z.; Simon, T.; Wei, S.-E.; Sheikh, Y. Realtime multi-person 2D pose estimation using part affinity fields. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 1302\u20131310, 2017.","DOI":"10.1109\/CVPR.2017.143"},{"key":"247_CR83","doi-asserted-by":"crossref","unstructured":"Fang, H.-S.; Xie, S. Q.; Tai, Y.-W.; Lu, C. W. RMPE: Regional multi-person pose estimation. In: Proceedings of the IEEE International Conference on Computer Vision, 2353\u20132362, 2017.","DOI":"10.1109\/ICCV.2017.256"},{"key":"247_CR84","doi-asserted-by":"crossref","unstructured":"Zhang, F.; Zhu, X. T.; Dai, H. B.; Ye, M.; Zhu, C. Distribution-aware coordinate representation for human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 7091\u20137100, 2020.","DOI":"10.1109\/CVPR42600.2020.00712"},{"key":"247_CR85","doi-asserted-by":"crossref","unstructured":"Sun, K.; Xiao, B.; Liu, D.; Wang, J. D. Deep high-resolution representation learning for human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 5686\u20135696, 2019.","DOI":"10.1109\/CVPR.2019.00584"},{"key":"247_CR86","unstructured":"Ren, S.; He, K.; Girshick, R.; Sun, J. Faster R-CNN: Towards real-time object detection with region proposal networks. arXiv preprint arXiv:1506.01497, 2015."},{"issue":"5","key":"247_CR87","doi-asserted-by":"publisher","first-page":"1483","DOI":"10.1109\/TPAMI.2019.2956516","volume":"43","author":"Z W Cai","year":"2021","unstructured":"Cai, Z. W.; Vasconcelos, N. Cascade R-CNN: High quality object detection and instance segmentation. IEEE Transactions on Pattern Analysis and Machine Intelligence Vol. 43, No. 5, 1483\u20131498, 2021.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"247_CR88","doi-asserted-by":"crossref","unstructured":"Lin, T. Y.; Goyal, P.; Girshick, R.; He, K. M.; Doll\u00e1r, P. Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision, 2999\u20133007, 2017.","DOI":"10.1109\/ICCV.2017.324"},{"key":"247_CR89","unstructured":"Zhou, X.; Wang, D.; Kr\u00e4henb\u00fchl, P. Objects as points. arXiv preprint arXiv:1904.07850, 2019."},{"key":"247_CR90","doi-asserted-by":"crossref","unstructured":"Tian, Z.; Shen, C. H.; Chen, H.; He, T. FCOS: Fully convolutional one-stage object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 9626\u20139635, 2019.","DOI":"10.1109\/ICCV.2019.00972"},{"key":"247_CR91","doi-asserted-by":"crossref","unstructured":"Stewart, R.; Andriluka, M.; Ng, A. Y. End-to-end people detection in crowded scenes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2325\u20132333, 2016.","DOI":"10.1109\/CVPR.2016.255"},{"key":"247_CR92","doi-asserted-by":"crossref","unstructured":"Hosang, J.; Benenson, R.; Schiele, B. Learning non-maximum suppression. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 6469\u20136477, 2017.","DOI":"10.1109\/CVPR.2017.685"},{"key":"247_CR93","unstructured":"Rezatofighi, S. H.; Kaskman, R.; Motlagh, F. T.; Shi, Q. F.; Cremers, D.; Leal-Taix\u00e9, L.; Reid, I. Deep perm-set net: Learn to predict sets with unknown permutation and cardinality using deep neural networks. arXiv preprint arXiv:1805.00613, 2018."},{"key":"247_CR94","doi-asserted-by":"crossref","first-page":"6745","DOI":"10.1109\/TIP.2020.2993403","volume":"29","author":"X J Pan","year":"2020","unstructured":"Pan, X. J.; Tang, F.; Dong, W. M.; Gu, Y.; Song, Z. C.; Meng, Y. P.; Xu, P.; Deussen, O.; Xu, C. Self-supervised feature augmentation for large image object detection. IEEE Transactions on Image Processing Vol. 29, 6745\u20136758, 2020.","journal-title":"IEEE Transactions on Image Processing"},{"key":"247_CR95","doi-asserted-by":"crossref","unstructured":"Pan, X.; Gao, Y.; Lin, Z.; Tang, F.; Dong, W.; Yuan, H.; Huang, F.; Xu, C. Unveiling the potential of structure preserving for weakly supervised object localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 11642\u201311651, 2021.","DOI":"10.1109\/CVPR46437.2021.01147"},{"key":"247_CR96","doi-asserted-by":"crossref","unstructured":"Pan, X. J.; Ren, Y. Q.; Sheng, K. K.; Dong, W. M.; Yuan, H. L.; Guo, X. W.; Ma, C.; Xu, C. Dynamic refinement network for oriented and densely packed object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 11204\u201311213, 2020.","DOI":"10.1109\/CVPR42600.2020.01122"},{"key":"247_CR97","unstructured":"Chu, X. X.; Tian, Z.; Wang, Y. Q.; Zhang, B.; Shen, C. H. Twins: Revisiting spatial attention design in vision transformers. arXiv preprint arXiv:2104.13840, 2021."},{"key":"247_CR98","unstructured":"Beal, J.; Kim, E.; Tzeng, E.; Park, D. H.; Kislyuk, D. Toward transformer-based object detection. arXiv preprint arXiv:2012.09958, 2020."},{"key":"247_CR99","doi-asserted-by":"crossref","unstructured":"Dai, J. F.; Qi, H. Z.; Xiong, Y. W.; Li, Y.; Zhang, G. D.; Hu, H.; Wei, Y. Deformable convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision, 764\u2013773, 2017.","DOI":"10.1109\/ICCV.2017.89"},{"key":"247_CR100","doi-asserted-by":"crossref","unstructured":"He, K. M.; Gkioxari, G.; Dollar, P.; Girshick, R. Mask R-CNN. In: Proceedings of the IEEE International Conference on Computer Vision, 2980\u20132988, 2017.","DOI":"10.1109\/ICCV.2017.322"},{"key":"247_CR101","doi-asserted-by":"crossref","unstructured":"Chen, H.; Wang, Y.; Guo, T.; Xu, C.; Deng, Y.; Liu, Z.; Ma, S.; Xu, C.; Xu, C.; Gao, W. Pre-trained image processing transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 12299\u201312310, 2021.","DOI":"10.1109\/CVPR46437.2021.01212"},{"key":"247_CR102","unstructured":"Esser, P.; Rombach, R.; Ommer, B. Taming transformers for high-resolution image synthesis. arXiv preprint arXiv:2012.09841, 2020."},{"key":"247_CR103","unstructured":"Kaiser, L.; Bengio, S. Can active memory replace attention? In: Proceedings of the 30th International Conference on Neural Information Processing Systems, 3781\u20133789, 2016."},{"issue":"4","key":"247_CR104","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","volume":"39","author":"O Vinyals","year":"2016","unstructured":"Vinyals, O.; Toshev, A.; Bengio, S.; Erhan, D. Show and tell: Lessons learned from the 2015 MSCOCO image captioning challenge. IEEE Transactions on Pattern Analysis and Machine Intelligence Vol. 39, No. 4, 652\u2013663, 2016.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"247_CR105","doi-asserted-by":"crossref","unstructured":"Vinyals, O.; Toshev, A.; Bengio, S.; Erhan, D. Show and tell: A neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 3156\u20133164, 2015.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"247_CR106","unstructured":"Rolfe, J. T. Discrete variational autoencoders. arXiv preprint arXiv:1609.02200, 2016."},{"key":"247_CR107","unstructured":"Goodfellow, I. J.; Pouget-Abadie, J.; Mirza, M.; Xu, B.; Warde-Farley, D.; Ozair, S.; Courville, A.; Bengio, Y. Generative adversarial networks. arXiv preprint arXiv:1406.2661, 2014."},{"key":"247_CR108","unstructured":"Ho, J.; Kalchbrenner, N.; Weissenborn, D.; Salimans, T. Axial attention in multidimensional transformers. arXiv preprint arXiv:1912.12180, 2019."},{"key":"247_CR109","doi-asserted-by":"crossref","unstructured":"Antol, S.; Agrawal, A.; Lu, J. S.; Mitchell, M.; Batra, D.; Zitnick, C. L.; Parikh, D. VQA: Visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, 2425\u20132433, 2015.","DOI":"10.1109\/ICCV.2015.279"},{"key":"247_CR110","doi-asserted-by":"crossref","unstructured":"Goyal, Y.; Khot, T.; Summers-Stay, D.; Batra, D.; Parikh, D. Making the V in VQA matter: Elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 6325\u20136334, 2017.","DOI":"10.1109\/CVPR.2017.670"},{"key":"247_CR111","unstructured":"Chen, X. L.; Fang, H.; Lin, T.-Y.; Vedantam, R.; Gupta, S.; Dollar, P.; Zitnick, C. L. Microsoft COCO captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325, 2015."},{"key":"247_CR112","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P.; Lai, A.; Hodosh, M.; Hockenmaier, J. From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Transactions of the Association for Computational Linguistics Vol. 2, 67\u201378, 2014.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"247_CR113","unstructured":"Gan, Z.; Chen, Y.-C.; Li, L.; Zhu, C.; Cheng, Y.; Liu, J. Large-scale adversarial training for vision-and-language representation learning. In: Advances in Neural Information Processing Systems, Vol. 33. Larochelle, H.; Ranzato, M.; Hadsell, R.; Balcan, M. F.; Lin, H. Eds. Curran Associates, Inc., 6616\u20136628, 2020."},{"key":"247_CR114","unstructured":"Lin, J. Y.; Yang, A.; Zhang, Y. C.; Liu, J.; Yang, H. X. InterBERT: Vision-and-language interaction for multi-modal pretraining. arXiv preprint arXiv:2003.13198, 2020."},{"key":"247_CR115","unstructured":"Su, W.; Zhu, X.; Cao, Y.; Li, B.; Lu, L.; Wei, F.; Dai, J. VL-BERT: Pre-training of generic visual-linguistic representations. In: Proceedings of the International Conference on Learning Representations, 2020."},{"issue":"7","key":"247_CR116","doi-asserted-by":"publisher","first-page":"13041","DOI":"10.1609\/aaai.v34i07.7005","volume":"34","author":"L W Zhou","year":"2020","unstructured":"Zhou, L. W.; Palangi, H.; Zhang, L.; Hu, H. D.; Corso, J.; Gao, J. F. Unified vision-language pre-training for image captioning and VQA. In: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 34, No. 7, 13041\u201313049, 2020.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"247_CR117","doi-asserted-by":"crossref","unstructured":"Girshick, R. Fast R-CNN. In: Proceedings of the IEEE International Conference on Computer Vision, 1440\u20131448, 2015.","DOI":"10.1109\/ICCV.2015.169"},{"key":"247_CR118","doi-asserted-by":"crossref","unstructured":"Li, W.; Gao, C.; Niu, G. C.; Xiao, X. Y.; Wang, H. F. UNIMO: Towards unified-modal understanding and generation via cross-modal contrastive learning. arXiv preprint arXiv:2012.15409, 2020.","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"247_CR119","unstructured":"Li, L. H.; Yatskar, M.; Yin, D.; Hsieh, C. J.; Chang, K. W. VisualBERT: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557, 2019."},{"key":"247_CR120","doi-asserted-by":"crossref","unstructured":"Alberti, C.; Ling, J.; Collins, M.; Reitter, D. Fusion of detected objects in text for visual question answering. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, 2131\u20132140, 2019.","DOI":"10.18653\/v1\/D19-1219"},{"key":"247_CR121","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision-ECCV 2020","author":"X J Li","year":"2020","unstructured":"Li, X. J.; Yin, X.; Li, C. Y.; Zhang, P. C.; Hu, X. W.; Zhang, L.; Wang, L.; Hu, H.; Dong, L.; Wei, F. et al. OSCAR: Object-semantics aligned pre-training for vision-language tasks. In: Computer Vision-ECCV 2020. Lecture Notes in Computer Science, Vol. 12375. Vedaldi, A.; Bischof, H.; Brox, T.; Frahm, J. M. Eds. Springer Cham, 121\u2013137, 2020."},{"key":"247_CR122","unstructured":"Yu, F.; Tang, J.; Yin, W.; Sun, Y.; Tian, H.; Wu, H.; Wang, H. ERNIE-ViL: Knowledge enhanced vision-language representations through scene graph. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2021."},{"key":"247_CR123","doi-asserted-by":"crossref","unstructured":"Li, Y.; Pan, Y.; Yao, T.; Chen, J.; Mei, T. Scheduled sampling in vision-language pretraining with decoupled encoder-decoder network. In: Proceedings of the AAAI Conference on Artificial Intelligence, 8518\u20138526, 2021.","DOI":"10.1609\/aaai.v35i10.17034"},{"key":"247_CR124","doi-asserted-by":"crossref","unstructured":"Tan, H.; Bansal, M. LXMERT: Learning cross-modality encoder representations from transformers. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, 5100\u20135111, 2019.","DOI":"10.18653\/v1\/D19-1514"},{"key":"247_CR125","doi-asserted-by":"crossref","unstructured":"Sharma, P.; Ding, N.; Goodman, S.; Soricut, R. Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics, 2556\u20132565, 2018.","DOI":"10.18653\/v1\/P18-1238"},{"key":"247_CR126","unstructured":"Ordonez, V.; Kulkarni, G.; Berg, T. L. Im2Text: Describing images using 1 million captioned photographs. In: Proceedings of the 24th International Conference on Neural Information Processing Systems, 1143\u20131151, 2011."},{"issue":"1","key":"247_CR127","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R.; Zhu, Y. K.; Groth, O.; Johnson, J.; Hata, K. J.; Kravitz, J.; Chen, S.; Kalantidis, Y.; Li, L.-J.; Shamma, D. A. et al. Visual genome: Connecting language and vision using crowdsourced dense image annotations. International Journal of Computer Vision Vol. 123, No. 1, 32\u201373, 2017.","journal-title":"International Journal of Computer Vision"},{"key":"247_CR128","doi-asserted-by":"crossref","unstructured":"Hudson, D. A.; Manning, C. D. GQA: A new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 6693\u20136702, 2019.","DOI":"10.1109\/CVPR.2019.00686"},{"key":"247_CR129","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision-ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y.; Maire, M.; Belongie, S.; Hays, J.; Perona, P.; Ramanan, D., Dollar, P.; Zitnick, C. L. Microsoft COCO: Common objects in context. In: Computer Vision-ECCV 2014. Lecture Notes in Computer Science, Vol. 8693. Fleet, D.; Pajdla, T.; Schiele, B.; Tuytelaars, T. Eds. Springer Cham, 740\u2013755, 2014."},{"issue":"7","key":"247_CR130","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","volume":"128","author":"A Kuznetsova","year":"2020","unstructured":"Kuznetsova, A.; Rom, H.; Alldrin, N.; Uijlings, J.; Krasin, I.; Pont-Tuset, J.; Kamali, S.; Popov, S.; Malloci, M.; Kolesnikov, A. et al. The open images dataset V4. International Journal of Computer Vision Vol. 128, No. 7, 1956\u20131981, 2020.","journal-title":"International Journal of Computer Vision"},{"key":"247_CR131","unstructured":"Zhang, P.; Li, X.; Hu, X.; Yang, J.; Zhang, L.; Wang, L.; Choi, Y.; Gao, J. VinVL: Revisiting visual representations in vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 5579\u20135588, 2021."},{"key":"247_CR132","unstructured":"Hu, R.; Singh, A. UniT: Multimodal multitask learning with a unified transformer. arXiv preprint arXiv:2102.10772, 2021."},{"key":"247_CR133","doi-asserted-by":"crossref","unstructured":"Suhr, A.; Zhou, S.; Zhang, A.; Zhang, I.; Bai, H. J.; Artzi, Y. A corpus for reasoning about natural language grounded in photographs. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, 6418\u20136428, 2019.","DOI":"10.18653\/v1\/P19-1644"},{"key":"247_CR134","unstructured":"Xie, N.; Lai, F.; Doran, D.; Kadav, A. Visual entailment: A novel task for fine-grained image understanding. arXiv preprint arXiv:1901.06706, 2019."},{"key":"247_CR135","doi-asserted-by":"crossref","unstructured":"Zellers, R.; Bisk, Y.; Farhadi, A.; Choi, Y. From recognition to cognition: Visual commonsense reasoning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 6713\u20136724, 2019.","DOI":"10.1109\/CVPR.2019.00688"},{"key":"247_CR136","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S.; Ordonez, V.; Matten, M.; Berg, T. ReferItGame: Referring to objects in photographs of natural scenes. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing, 787\u2013798, 2014.","DOI":"10.3115\/v1\/D14-1086"},{"key":"247_CR137","doi-asserted-by":"crossref","unstructured":"Sheng, K. K.; Dong, W. M.; Ma, C. Y.; Mei, X.; Huang, F. Y.; Hu, B.-G. Attention-based multi-patch aggregation for image aesthetic assessment. In: Proceedings of the 26th ACM International Conference on Multimedia, 879\u2013886, 2018.","DOI":"10.1145\/3240508.3240554"},{"issue":"4","key":"247_CR138","doi-asserted-by":"publisher","first-page":"5709","DOI":"10.1609\/aaai.v34i04.6026","volume":"34","author":"K K Sheng","year":"2020","unstructured":"Sheng, K. K.; Dong, W. M.; Chai, M. L.; Wang, G. H.; Zhou, P.; Huang, F. Y.; Hu, B.-G.; Ji, R.; Ma, C. Revisiting image aesthetic assessment via self-supervised feature learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 34, No. 4, 5709\u20135716, 2020.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"issue":"1","key":"247_CR139","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1007\/s41095-020-0193-5","volume":"7","author":"K K Sheng","year":"2021","unstructured":"Sheng, K. K.; Dong, W. M.; Huang, H. B.; Chai, M. L.; Zhang, Y.; Ma, C. Y.; Hu, B.-G. Learning to assess visual aesthetics of food images. Computational Visual Media Vol. 7, No. 1, 139\u2013152, 2021.","journal-title":"Computational Visual Media"},{"key":"247_CR140","doi-asserted-by":"crossref","unstructured":"Zhang, S. F.; Wang, X. B.; Liu, A.; Zhao, C. X.; Wan, J.; Escalera, S.; Shi, H.; Wang, Z.; Li, S. Z. A dataset and benchmark for large-scale multi-modal face anti-spoofing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 919\u2013928, 2019.","DOI":"10.1109\/CVPR.2019.00101"},{"key":"247_CR141","doi-asserted-by":"crossref","unstructured":"Chen, Z.; Yao, T.; Sheng, K.; Ding, S.; Tai, Y.; Li, J.; Huang, F.; Jin, X. Generalizable representation learning for mixture domain face anti-spoofing. In: Proceedings of the AAAI Conference on Artificial Intelligence, 1132\u20131139, 2021.","DOI":"10.1609\/aaai.v35i2.16199"},{"key":"247_CR142","unstructured":"Zhao, H.; Jiang, L.; Jia, J.; Torr, P.; Koltun, V. Point transformer. arXiv preprint arXiv:2012.09164, 2020."},{"key":"247_CR143","unstructured":"Zoph, B.; Le, Q. V. Neural architecture search with reinforcement learning. In: Proceedings of the International Conference on Learning Representations, 2017."},{"key":"247_CR144","doi-asserted-by":"crossref","unstructured":"Zoph, B.; Vasudevan, V.; Shlens, J.; Le, Q. V. Learning transferable architectures for scalable image recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 8697\u20138710, 2018.","DOI":"10.1109\/CVPR.2018.00907"},{"key":"247_CR145","doi-asserted-by":"publisher","first-page":"4780","DOI":"10.1609\/aaai.v33i01.33014780","volume":"33","author":"E Real","year":"2019","unstructured":"Real, E.; Aggarwal, A.; Huang, Y. P.; Le, Q. V. Regularized evolution for image classifier architecture search. In: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 33, 4780\u20134789, 2019.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"247_CR146","doi-asserted-by":"crossref","unstructured":"Wang, H. R.; Wu, Z. H.; Liu, Z. J.; Cai, H.; Zhu, L. G.; Gan, C.; Han, S. HAT: Hardware-aware transformers for efficient natural language processing. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, 7675\u20137688, 2020.","DOI":"10.18653\/v1\/2020.acl-main.686"},{"key":"247_CR147","unstructured":"So, D.; Le, Q.; Liang, C. The evolved transformer. In: Proceedings of the 36th International Conference on Machine Learning, 5877\u20135886, 2019."},{"key":"247_CR148","doi-asserted-by":"crossref","unstructured":"Li, C. L.; Tang, T.; Wang, G. R.; Peng, J. F.; Chang, X. J. BossNAS: Exploring hybrid CNN-transformers with Block-wisely Self-supervised neural architecture search. arXiv preprint arXiv:2103.12424, 2021.","DOI":"10.1109\/CVPR42600.2020.00206"},{"key":"247_CR149","unstructured":"Schulz, K.; Sixt, L.; Tombari, F.; Landgraf, T. Restricting the flow: Information bottlenecks for attribution. In: Proceedings of the International Conference on Learning Representations, 2019."},{"key":"247_CR150","doi-asserted-by":"crossref","unstructured":"Jiang, Z.; Tang, R.; Xin, J.; Lin, J. Inserting information bottleneck for attribution in transformers. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing: Findings, 3850\u20133857, 2020.","DOI":"10.18653\/v1\/2020.findings-emnlp.343"}],"container-title":["Computational Visual Media"],"original-title":[],"link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s41095-021-0247-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s41095-021-0247-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s41095-021-0247-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10750449\/10897562\/10897566.pdf?arnumber=10897566","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T18:38:39Z","timestamp":1762367919000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10897566\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3]]},"references-count":150,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1007\/s41095-021-0247-3","relation":{},"ISSN":["2096-0662","2096-0433"],"issn-type":[{"value":"2096-0662","type":"electronic"},{"value":"2096-0433","type":"print"}],"subject":[],"published":{"date-parts":[[2022,3]]},"assertion":[{"value":"17 June 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 July 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 October 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}