{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T14:59:44Z","timestamp":1777388384456,"version":"3.51.4"},"publisher-location":"Cham","reference-count":90,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031200731","type":"print"},{"value":"9783031200748","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-20074-8_16","type":"book-chapter","created":{"date-parts":[[2022,11,11]],"date-time":"2022-11-11T20:23:11Z","timestamp":1668198191000},"page":"271-289","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Exploring Fine-Grained Audiovisual Categorization with\u00a0the\u00a0SSW60 Dataset"],"prefix":"10.1007","author":[{"given":"Grant","family":"Van Horn","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui","family":"Qian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kimberly","family":"Wilber","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hartwig","family":"Adam","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Oisin","family":"Mac Aodha","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Serge","family":"Belongie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,11,12]]},"reference":[{"key":"16_CR1","unstructured":"iNaturalist 2021 Challenge. http:\/\/www.kaggle.com\/c\/inaturalist-2021. Accessed 7 Mar 2022"},{"key":"16_CR2","unstructured":"Macaulay Library. http:\/\/www.macaulaylibrary.org. Accessed 7 Mar 2022"},{"key":"16_CR3","unstructured":"Merlin Sound ID. http:\/\/merlin.allaboutbirds.org\/sound-id. Accessed 7 Mar 2022"},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Alsahafi, Y., Lemmond, D., Ventura, J., Boult, T.: Carvideos: a novel dataset for fine-grained car classification in videos. In: International Conference on Information Technology-New Generations (2019)","DOI":"10.1007\/978-3-030-14070-0_63"},{"key":"16_CR5","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Zisserman, A.: Look, listen and learn. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.73"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: Soundnet: learning sound representations from unlabeled video. In: NeurIPS (2016)","DOI":"10.1109\/CVPR.2016.18"},{"key":"16_CR7","doi-asserted-by":"crossref","unstructured":"Baker, E., Vincent, S.: A deafening silence: a lack of data and reproducibility in published bioacoustics research? Biodivers. Data J. (2019)","DOI":"10.3897\/BDJ.7.e36783"},{"issue":"8","key":"16_CR8","doi-asserted-by":"publisher","first-page":"2939","DOI":"10.1007\/s00371-021-02166-7","volume":"38","author":"K Bayoudh","year":"2021","unstructured":"Bayoudh, K., Knani, R., Hamdaoui, F., Mtibaa, A.: A survey on deep multimodal learning for computer vision: advances, trends, applications, and datasets. Vis. Comput. 38(8), 2939\u20132970 (2021)","journal-title":"Vis. Comput."},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Berg, T., Liu, J., Lee, S.W., Alexander, M.L., Jacobs, D.W., Belhumeur, P.N.: Birdsnap: large-scale fine-grained visual categorization of birds. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.259"},{"key":"16_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"446","DOI":"10.1007\/978-3-319-10599-4_29","volume-title":"Computer Vision \u2013 ECCV 2014","author":"L Bossard","year":"2014","unstructured":"Bossard, L., Guillaumin, M., Van Gool, L.: Food-101 \u2013 mining discriminative components with random forests. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8694, pp. 446\u2013461. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10599-4_29"},{"key":"16_CR11","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Afouras, T., Nagrani, A., Vedaldi, A., Zisserman, A.: Localizing visual sounds the hard way. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Vedaldi, A., Zisserman, A.: Vggsound: a large-scale audio-visual dataset. In: International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2020)","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"issue":"5","key":"16_CR14","doi-asserted-by":"publisher","first-page":"837","DOI":"10.1109\/5.664274","volume":"86","author":"T Chen","year":"1998","unstructured":"Chen, T., Rao, R.R.: Audio-visual integration in multimodal communication. Proc. IEEE 86(5), 837\u2013852 (1998)","journal-title":"Proc. IEEE"},{"key":"16_CR15","doi-asserted-by":"crossref","unstructured":"Chronister, L., Rhinehart, T., Place, A., Kitzes, J.: An annotated set of audio recordings of Eastern North American birds containing frequency, time, and species information. Ecology (2021)","DOI":"10.1002\/ecy.3329"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Cramer, J., Lostanlen, V., Farnsworth, A., Salamon, J., Bello, J.P.: Chirping up the right tree: incorporating biological taxonomies into deep bioacoustic classifiers. In: International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2020)","DOI":"10.1109\/ICASSP40776.2020.9052908"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"Damen, D., et al.: Scaling egocentric vision: the epic-kitchens dataset. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01225-0_44"},{"key":"16_CR18","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"16_CR19","doi-asserted-by":"crossref","unstructured":"Everingham, M., Van Gool, L., Williams, C.K., Winn, J., Zisserman, A.: The pascal visual object classes (VOC) challenge. In: IJCV (2010)","DOI":"10.1007\/s11263-009-0275-4"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"Fayek, H.M., Kumar, A.: Large scale audiovisual learning of sounds with weakly labeled data. arXiv:2006.01595 (2020)","DOI":"10.24963\/ijcai.2020\/78"},{"key":"16_CR21","unstructured":"Fonseca, E., Favory, X., Pons, J., Font, F., Serra, X.: FSD50K: an open dataset of human-labeled sound events. arXiv:2010.00475 (2020)"},{"key":"16_CR22","unstructured":"Garofolo, J.S.: Timit acoustic phonetic continuous speech corpus. Linguistic Data Consortium (1993)"},{"key":"16_CR23","doi-asserted-by":"crossref","unstructured":"Ge, Z., et al.: Exploiting temporal information for DCNN-based fine-grained object classification. In: International Conference on Digital Image Computing: Techniques and Applications (2016)","DOI":"10.1109\/DICTA.2016.7797039"},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Gebru, T., Krause, J., Wang, Y., Chen, D., Deng, J., Fei-Fei, L.: Fine-grained car detection for visual census estimation. In: AAAI (2017)","DOI":"10.1609\/aaai.v31i1.11174"},{"key":"16_CR25","doi-asserted-by":"crossref","unstructured":"Gemmeke, J.F., et al.: Audio set: an ontology and human-labeled dataset for audio events. In: International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"16_CR26","doi-asserted-by":"crossref","unstructured":"Gong, Y., Chung, Y.A., Glass, J.: AST: audio spectrogram transformer. In: Interspeech (2021)","DOI":"10.21437\/Interspeech.2021-698"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Goyal, R., et al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"16_CR28","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: LVIS: a dataset for large vocabulary instance segmentation. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"16_CR29","unstructured":"He, J., et al.: TransFG: a transformer architecture for fine-grained recognition. arXiv:2103.07976 (2021)"},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"He, X., Peng, Y., Xie, L.: A new benchmark and approach for fine-grained cross-media retrieval. In: International Conference on Multimedia (2019)","DOI":"10.1145\/3343031.3350974"},{"key":"16_CR32","doi-asserted-by":"crossref","unstructured":"Hershey, S., et al.: CNN architectures for large-scale audio classification. In: ICASSP (2017)","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"16_CR33","doi-asserted-by":"crossref","unstructured":"Hou, S., Feng, Y., Wang, Z.: VegFru: a domain-specific dataset for fine-grained visual categorization. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.66"},{"key":"16_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"316","DOI":"10.1007\/978-3-030-58452-8_19","volume-title":"Computer Vision \u2013 ECCV 2020","author":"M Jia","year":"2020","unstructured":"Jia, M., et al.: Fashionpedia: ontology, segmentation, and an attribute localization dataset. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 316\u2013332. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_19"},{"issue":"11","key":"16_CR35","doi-asserted-by":"publisher","first-page":"2327","DOI":"10.1109\/TPAMI.2016.2551239","volume":"38","author":"V Kalogeiton","year":"2016","unstructured":"Kalogeiton, V., Ferrari, V., Schmid, C.: Analysing domain shift factors between videos and images for object detection. PAMI 38(11), 2327\u20132334 (2016)","journal-title":"PAMI"},{"key":"16_CR36","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"16_CR37","unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv:1705.06950 (2017)"},{"key":"16_CR38","doi-asserted-by":"crossref","unstructured":"Kazakos, E., Nagrani, A., Zisserman, A., Damen, D.: Epic-fusion: audio-visual temporal binding for egocentric action recognition. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00559"},{"key":"16_CR39","unstructured":"Khosla, A., Jayadevaprakash, N., Yao, B., Fei-Fei, L.: Novel dataset for fine-grained image categorization. In: First Workshop on Fine-Grained Visual Categorization (2011)"},{"key":"16_CR40","unstructured":"Krasin, I., et al.: Openimages: a public dataset for large-scale multi-label and multi-class image classification (2017). http:\/\/storage.googleapis.com\/openimages\/web\/index.html"},{"key":"16_CR41","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"301","DOI":"10.1007\/978-3-319-46487-9_19","volume-title":"Computer Vision \u2013 ECCV 2016","author":"J Krause","year":"2016","unstructured":"Krause, J., et al.: The unreasonable effectiveness of noisy data for fine-grained recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9907, pp. 301\u2013320. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_19"},{"key":"16_CR42","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., Fei-Fei, L.: 3D object representations for fine-grained categorization. In: ICCV Workshops (2013)","DOI":"10.1109\/ICCVW.2013.77"},{"key":"16_CR43","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: ICCV (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"16_CR44","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"502","DOI":"10.1007\/978-3-642-33709-3_36","volume-title":"Computer Vision \u2013 ECCV 2012","author":"N Kumar","year":"2012","unstructured":"Kumar, N., et al.: Leafsnap: a computer vision system for automatic plant species identification. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7573, pp. 502\u2013516. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33709-3_36"},{"key":"16_CR45","doi-asserted-by":"crossref","unstructured":"Lee, S., et al.: ACAV100M: automatic curation of large-scale datasets for audio-visual video representation learning. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01011"},{"key":"16_CR46","doi-asserted-by":"crossref","unstructured":"Li, G., Wei, Y., Tian, Y., Xu, C., Wen, J.R., Hu, D.: Learning to answer questions in dynamic audio-visual scenarios. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19108\u201319118 (2022)","DOI":"10.1109\/CVPR52688.2022.01852"},{"key":"16_CR47","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"520","DOI":"10.1007\/978-3-030-01231-1_32","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Li","year":"2018","unstructured":"Li, Y., Li, Y., Vasconcelos, N.: RESOUND: towards action recognition without representation bias. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 520\u2013535. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_32"},{"key":"16_CR48","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"16_CR49","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"466","DOI":"10.1007\/978-3-319-10593-2_31","volume-title":"Computer Vision \u2013 ECCV 2014","author":"Y-L Lin","year":"2014","unstructured":"Lin, Y.-L., Morariu, V.I., Hsu, W., Davis, L.S.: Jointly optimizing 3D model fitting and fine-grained classification. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8692, pp. 466\u2013480. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10593-2_31"},{"key":"16_CR50","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1007\/978-3-642-33718-5_13","volume-title":"Computer Vision \u2013 ECCV 2012","author":"J Liu","year":"2012","unstructured":"Liu, J., Kanazawa, A., Jacobs, D., Belhumeur, P.: Dog breed classification using part localization. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7572, pp. 172\u2013185. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33718-5_13"},{"key":"16_CR51","doi-asserted-by":"crossref","unstructured":"Lostanlen, V., Salamon, J., Farnsworth, A., Kelling, S., Bello, J.P.: Birdvox-full-night: a dataset and benchmark for avian flight call detection. In: International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2018)","DOI":"10.1109\/ICASSP.2018.8461410"},{"key":"16_CR52","doi-asserted-by":"crossref","unstructured":"Mac Aodha, O., et al.: Bat detective-deep learning tools for bat acoustic signal detection. PLoS Comput. Biol. 14(3), e1005995 (2018)","DOI":"10.1371\/journal.pcbi.1005995"},{"key":"16_CR53","unstructured":"Maji, S., Rahtu, E., Kannala, J., Blaschko, M., Vedaldi, A.: Fine-grained visual classification of aircraft. arXiv:1306.5151 (2013)"},{"issue":"2","key":"16_CR54","doi-asserted-by":"publisher","first-page":"502","DOI":"10.1109\/TPAMI.2019.2901464","volume":"42","author":"M Monfort","year":"2019","unstructured":"Monfort, M., et al.: Moments in time dataset: one million videos for event understanding. PAMI 42(2), 502\u2013508 (2019)","journal-title":"PAMI"},{"key":"16_CR55","doi-asserted-by":"crossref","unstructured":"Morfi, V., Bas, Y., Pamu\u0142a, H., Glotin, H., Stowell, D.: NIPS4Bplus: a richly annotated birdsong audio dataset. PeerJ Comput. Sci. 5, e223 (2019)","DOI":"10.7717\/peerj-cs.223"},{"key":"16_CR56","unstructured":"Nagrani, A., Yang, S., Arnab, A., Jansen, A., Schmid, C., Sun, C.: Attention bottlenecks for multimodal fusion. In: NeurIPS (2021)"},{"key":"16_CR57","unstructured":"Nilsback, M.E., Zisserman, A.: A visual vocabulary for flower classification. In: CVPR (2006)"},{"key":"16_CR58","doi-asserted-by":"crossref","unstructured":"Nilsback, M.E., Zisserman, A.: Automated flower classification over a large number of classes. In: Indian Conference on Computer Vision, Graphics & Image Processing (2008)","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"16_CR59","doi-asserted-by":"crossref","unstructured":"Park, D.S., et al.: SpecAugment: a simple data augmentation method for automatic speech recognition. In: Interspeech (2019)","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"16_CR60","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A., Jawahar, C.V.: Cats and dogs. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"16_CR61","doi-asserted-by":"crossref","unstructured":"Piczak, K.J.: ESC: dataset for environmental sound classification. In: International Conference on Multimedia (2015)","DOI":"10.1145\/2733373.2806390"},{"key":"16_CR62","doi-asserted-by":"crossref","unstructured":"Piergiovanni, A., Ryoo, M.S.: Fine-grained activity recognition in baseball videos. In: CVPR Workshops (2018)","DOI":"10.1109\/CVPRW.2018.00226"},{"key":"16_CR63","unstructured":"Robinson, T., Fransen, J., Pye, D., Foote, J., Renals, S.: WSJCAMO: a British English speech corpus for large vocabulary continuous speech recognition. In: International Conference on Acoustics, Speech, and Signal Processing (1995)"},{"issue":"12","key":"16_CR64","doi-asserted-by":"publisher","first-page":"2432","DOI":"10.1111\/2041-210X.13721","volume":"12","author":"C Roemer","year":"2021","unstructured":"Roemer, C., Julien, J.F., Bas, Y.: An automatic classifier of bat sonotypes around the world. Methods Ecol. Evol. 12(12), 2432\u20132444 (2021)","journal-title":"Methods Ecol. Evol."},{"key":"16_CR65","doi-asserted-by":"crossref","unstructured":"Russakovsky, O., et al.: Imagenet large scale visual recognition challenge. In: IJCV (2015)","DOI":"10.1007\/s11263-015-0816-y"},{"key":"16_CR66","doi-asserted-by":"crossref","unstructured":"Saito, T., Kanezaki, A., Harada, T.: IBC127: video dataset for fine-grained bird classification. In: International Conference on Multimedia and Expo (2016)","DOI":"10.1109\/ICME.2016.7552915"},{"key":"16_CR67","doi-asserted-by":"crossref","unstructured":"Salamon, J., Jacoby, C., Bello, J.P.: A dataset and taxonomy for urban sound research. In: International Conference on Multimedia (2014)","DOI":"10.1145\/2647868.2655045"},{"key":"16_CR68","doi-asserted-by":"crossref","unstructured":"Sevilla-Lara, L., Zha, S., Yan, Z., Goswami, V., Feiszli, M., Torresani, L.: Only time can tell: discovering temporal data for temporal modeling. In: WACV (2021)","DOI":"10.1109\/WACV48630.2021.00058"},{"key":"16_CR69","doi-asserted-by":"crossref","unstructured":"Shao, D., Zhao, Y., Dai, B., Lin, D.: FineGym: a hierarchical video dataset for fine-grained action understanding. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"16_CR70","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: ICLR (2015)"},{"key":"16_CR71","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv:1212.0402 (2012)"},{"issue":"3","key":"16_CR72","doi-asserted-by":"publisher","first-page":"368","DOI":"10.1111\/2041-210X.13103","volume":"10","author":"D Stowell","year":"2019","unstructured":"Stowell, D., Wood, M.D., Pamu\u0142a, H., Stylianou, Y., Glotin, H.: Automatic acoustic detection of birds through deep learning: the first bird audio detection challenge. Methods Ecol. Evol. 10(3), 368\u2013380 (2019)","journal-title":"Methods Ecol. Evol."},{"issue":"2","key":"16_CR73","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/2812802","volume":"59","author":"B Thomee","year":"2016","unstructured":"Thomee, B., et al.: YFCC100M: the new data in multimedia research. Commun. ACM 59(2), 64\u201373 (2016)","journal-title":"Commun. ACM"},{"key":"16_CR74","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1007\/978-3-030-58580-8_26","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Tian","year":"2020","unstructured":"Tian, Y., Li, D., Xu, C.: Unified multisensory perception: weakly-supervised audio-visual video\u00a0parsing. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 436\u2013454. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_26"},{"key":"16_CR75","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"252","DOI":"10.1007\/978-3-030-01216-8_16","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Tian","year":"2018","unstructured":"Tian, Y., Shi, J., Li, B., Duan, Z., Xu, C.: Audio-visual event localization in unconstrained videos. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11206, pp. 252\u2013268. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01216-8_16"},{"key":"16_CR76","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: ICML (2021)"},{"key":"16_CR77","doi-asserted-by":"crossref","unstructured":"Van Horn, G., et al.: Building a bird recognition app and large scale dataset with citizen scientists: the fine print in fine-grained dataset collection. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298658"},{"key":"16_CR78","doi-asserted-by":"crossref","unstructured":"Van Horn, G., Cole, E., Beery, S., Wilber, K., Belongie, S., Mac Aodha, O.: Benchmarking representation learning for natural world image collections. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01269"},{"key":"16_CR79","doi-asserted-by":"crossref","unstructured":"Van Horn, G., et al.: The inaturalist species classification and detection dataset. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00914"},{"key":"16_CR80","doi-asserted-by":"crossref","unstructured":"Vedaldi, A., et al.: Understanding objects in detail with fine-grained attributes. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.463"},{"key":"16_CR81","unstructured":"Wah, C., Branson, S., Welinder, P., Perona, P., Belongie, S.: The caltech-UCSD birds-200-2011 dataset. Technical report, CNS-TR-2011-001 (2011)"},{"issue":"11","key":"16_CR82","doi-asserted-by":"publisher","first-page":"2740","DOI":"10.1109\/TPAMI.2018.2868668","volume":"41","author":"L Wang","year":"2018","unstructured":"Wang, L., et al.: Temporal segment networks for action recognition in videos. PAMI 41(11), 2740\u20132755 (2018)","journal-title":"PAMI"},{"key":"16_CR83","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"322","DOI":"10.1007\/978-3-030-58577-8_20","volume-title":"Computer Vision \u2013 ECCV 2020","author":"P Wu","year":"2020","unstructured":"Wu, P., et al.: Not only look, but also listen: learning multimodal violence detection under weak supervision. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 322\u2013339. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_20"},{"key":"16_CR84","unstructured":"Xiao, F., Lee, Y.J., Grauman, K., Malik, J., Feichtenhofer, C.: Audiovisual slowfast networks for video recognition. arXiv:2001.08740 (2020)"},{"key":"16_CR85","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1007\/978-3-030-01267-0_19","volume-title":"Computer Vision \u2013 ECCV 2018","author":"S Xie","year":"2018","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., Murphy, K.: Rethinking spatiotemporal feature learning: speed-accuracy trade-offs in video classification. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11219, pp. 318\u2013335. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01267-0_19"},{"key":"16_CR86","doi-asserted-by":"crossref","unstructured":"Yang, L., Luo, P., Change Loy, C., Tang, X.: A large-scale car dataset for fine-grained categorization and verification. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299023"},{"key":"16_CR87","doi-asserted-by":"crossref","unstructured":"Yun, H., Yu, Y., Yang, W., Lee, K., Kim, G.: Pano-AVQA: grounded audio-visual question answering on 360deg videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2031\u20132041 (2021)","DOI":"10.1109\/ICCV48922.2021.00204"},{"issue":"9","key":"16_CR88","doi-asserted-by":"publisher","first-page":"1082","DOI":"10.1111\/2041-210X.12556","volume":"7","author":"V Zamora-Gutierrez","year":"2016","unstructured":"Zamora-Gutierrez, V., et al.: Acoustic identification of Mexican bats based on taxonomic and ecological constraints on call design. Methods Ecol. Evol. 7(9), 1082\u20131091 (2016)","journal-title":"Methods Ecol. Evol."},{"issue":"6","key":"16_CR89","doi-asserted-by":"publisher","first-page":"1452","DOI":"10.1109\/TPAMI.2017.2723009","volume":"40","author":"B Zhou","year":"2017","unstructured":"Zhou, B., Lapedriza, A., Khosla, A., Oliva, A., Torralba, A.: Places: a 10 million image database for scene recognition. PAMI 40(6), 1452\u20131464 (2017)","journal-title":"PAMI"},{"key":"16_CR90","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1007\/978-3-030-01228-1_9","volume-title":"Computer Vision \u2013 ECCV 2018","author":"C Zhu","year":"2018","unstructured":"Zhu, C., et al.: Fine-grained video categorization with redundancy reduction attention. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11209, pp. 139\u2013155. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01228-1_9"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-20074-8_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,11]],"date-time":"2022-11-11T20:28:41Z","timestamp":1668198521000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-20074-8_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031200731","9783031200748"],"references-count":90,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-20074-8_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"12 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}