{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,5]],"date-time":"2026-02-05T23:40:39Z","timestamp":1770334839746,"version":"3.49.0"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031733369","type":"print"},{"value":"9783031733376","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73337-6_20","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T23:02:27Z","timestamp":1730329347000},"page":"348-366","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Efficient Image Pre-training with\u00a0Siamese Cropped Masked Autoencoders"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-5305-7307","authenticated-orcid":false,"given":"Alexandre","family":"Eyma\u00ebl","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1752-1195","authenticated-orcid":false,"given":"Renaud","family":"Vandeghen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5314-9015","authenticated-orcid":false,"given":"Anthony","family":"Cioppa","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3937-9834","authenticated-orcid":false,"given":"Silvio","family":"Giancola","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5534-587X","authenticated-orcid":false,"given":"Bernard","family":"Ghanem","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6260-6487","authenticated-orcid":false,"given":"Marc","family":"Van Droogenbroeck","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"20_CR1","doi-asserted-by":"publisher","unstructured":"Balestriero, R., et\u00a0al.: A cookbook of self-supervised learning. CoRR abs\/2304.12210 (2023). https:\/\/doi.org\/10.48550\/arXiv.2304.12210","DOI":"10.48550\/arXiv.2304.12210"},{"key":"20_CR2","doi-asserted-by":"publisher","unstructured":"Bandara, W.G.C., Patel, N., Gholami, A., Nikkhah, M., Agrawal, M., Patel, V.M.: AdaMAE: adaptive masking for efficient spatiotemporal learning with masked autoencoders. In: IEEE\/CVF Conference on Computing Vision Pattern Recognition (CVPR), Vancouver, Canada, pp. 14507\u201314517. Institute of Electrical and Electronics Engineers (IEEE) (2023). https:\/\/doi.org\/10.1109\/cvpr52729.2023.01394","DOI":"10.1109\/cvpr52729.2023.01394"},{"key":"20_CR3","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: BEiT: BERT pre-training of image transformers. In: International Conference on Learning Representations (ICLR) (2022). https:\/\/openreview.net\/forum?id=p-BhZSz59o4"},{"key":"20_CR4","doi-asserted-by":"publisher","unstructured":"Bao, Z., Tokmakov, P., Jabri, A., Wang, Y.X., Gaidon, A., Hebert, M.: Discovering objects that can move. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), New Orleans, LA, USA, pp. 11779\u201311788. Institute of Electrical and Electronics Engineers (IEEE) (2022). https:\/\/doi.org\/10.1109\/cvpr52688.2022.01149","DOI":"10.1109\/cvpr52688.2022.01149"},{"key":"20_CR5","unstructured":"Bromley, J., Guyon, I., LeCun, Y., S\u00e4ckinger, E., Shah, R.: Signature verification using a \u201cSiamese\u201d time delay neural network. In: Cowan, J., Tesauro, G., Alspector, J. (eds.) Advances in Neural Information Processing Systems. vol.\u00a06. Morgan-Kaufmann (1993). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/1993\/file\/288cc0ff022877bd3df94bc9360b9c5d-Paper.pdf"},{"key":"20_CR6","doi-asserted-by":"publisher","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: IEEE\/CVF International Conference on Computer Vision (ICCV), Montreal, QC, Canada, pp. 9630\u20139640. Institute of Electrical and Electronics Engineers (IEEE) (2021). https:\/\/doi.org\/10.1109\/iccv48922.2021.00951","DOI":"10.1109\/iccv48922.2021.00951"},{"key":"20_CR7","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning (ICML). Proceedings of Machine Learning Research, vol.\u00a0119, pp. 1597\u20131607 (2020)"},{"issue":"1","key":"20_CR8","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1007\/s11263-023-01852-4","volume":"132","author":"X Chen","year":"2023","unstructured":"Chen, X., et al.: Context autoencoder for self-supervised representation learning. Int. J. Comput. Vis. 132(1), 208\u2013223 (2023). https:\/\/doi.org\/10.1007\/s11263-023-01852-4","journal-title":"Int. J. Comput. Vis."},{"key":"20_CR9","doi-asserted-by":"publisher","unstructured":"Chen, X., He, K.: Exploring simple Siamese representation learning. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Nashville, TN, USA, pp. 15745\u201315753. Institute of Electrical and Electronics Engineers (IEEE) (2021). https:\/\/doi.org\/10.1109\/cvpr46437.2021.01549","DOI":"10.1109\/cvpr46437.2021.01549"},{"key":"20_CR10","doi-asserted-by":"publisher","unstructured":"Dave, I., Gupta, R., Rizve, M.N., Shah, M.: TCLR: temporal contrastive learning for video representation. Comput. Vis. Image Underst. 219, 1\u20139 (2022). https:\/\/doi.org\/10.1016\/j.cviu.2022.103406","DOI":"10.1016\/j.cviu.2022.103406"},{"key":"20_CR11","doi-asserted-by":"publisher","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Miami, FL, USA, pp. 248\u2013255. Institute of Electrical and Electronics Engineers (IEEE) (2009). https:\/\/doi.org\/10.1109\/CVPR.2009.5206848","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"20_CR12","doi-asserted-by":"publisher","unstructured":"Doersch, C., Zisserman, A.: Multi-task self-supervised visual learning. In: IEEE International Conference on Computer Vision (ICCV), Venice, Italy, pp. 2070\u20132079. Institute of Electrical and Electronics Engineers (IEEE) (2017). https:\/\/doi.org\/10.1109\/iccv.2017.226","DOI":"10.1109\/iccv.2017.226"},{"key":"20_CR13","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (ICLR), Austria (2021)"},{"issue":"2","key":"20_CR14","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C.K.I., Winn, J., Zisserman, A.: The PASCAL visual object classes (VOC) challenge. Int. J. Comput. Vis. 88(2), 303\u2013338 (2010). https:\/\/doi.org\/10.1007\/s11263-009-0275-4","journal-title":"Int. J. Comput. Vis."},{"key":"20_CR15","doi-asserted-by":"publisher","unstructured":"Fan, D., et al.: Motion-guided masking for spatiotemporal representation learning. In: IEEE\/CVF International Conference on Computer Vision (ICCV), Paris, France, pp. 5596\u20135606. Institute of Electrical and Electronics Engineers (IEEE) (2023). https:\/\/doi.org\/10.1109\/iccv51070.2023.00517","DOI":"10.1109\/iccv51070.2023.00517"},{"key":"20_CR16","unstructured":"Feichtenhofer, C., FFan, H., Li, Y., He, K.: Masked autoencoders as spatiotemporal learners. In: Advances in Neural Information Processing Systems (NeurIPS), vol.\u00a035, pp. 35946\u201335958. Curran Assoc. Inc. (2022), https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/e97d1081481a4017df96b51be31001d3-Paper-Conference.pdf"},{"key":"20_CR17","doi-asserted-by":"publisher","unstructured":"Feng, Z., Zhang, S.: Evolved part masking for self-supervised learning. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10386\u201310395. Institute of Electrical and Electronics Engineers (IEEE), Vancouver, Canada (2023). https:\/\/doi.org\/10.1109\/cvpr52729.2023.01001","DOI":"10.1109\/cvpr52729.2023.01001"},{"key":"20_CR18","doi-asserted-by":"publisher","unstructured":"da Girdhar, R., El-Nouby, A., Singh, M., Alwala, K.V., Joulin, A., Misra, I.: OmniMAE: single model masked pretraining on images and videos. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Vancouver, Canada, pp. 10406\u201310417. Institute of Electrical and Electronics Engineers (IEEE) (2023). https:\/\/doi.org\/10.1109\/cvpr52729.2023.01003","DOI":"10.1109\/cvpr52729.2023.01003"},{"key":"20_CR19","unstructured":"Girdhar, R., Ramanan, D.: Attentional pooling for action recognition. In: Advances in Neural Information Processing Systems (NeurIPS), Long Beach, CA, USA, vol.\u00a030, pp. 1\u201312. Curran Assoc. Inc. (2017)"},{"key":"20_CR20","unstructured":"Grill, J.B., et al.: Bootstrap your own latent \u2013 a new approach to self-supervised learning. In: Advances in Neural Information Processing Systems (NeurIPS), vol.\u00a033, pp. 21271\u201321284. Curran Assoc. Inc. (2020)"},{"key":"20_CR21","unstructured":"Gupta, A., Wu, J., Deng, J., Fei-Fei, L.: Siamese masked autoencoders. In: Advances in Neural Information Processing Systems (NeurIPS), New Orleans, LA, USA, vol.\u00a037. Curran Assoc. Inc. (2023). https:\/\/openreview.net\/forum?id=yC3q7vInux"},{"key":"20_CR22","doi-asserted-by":"publisher","unstructured":"Hadsell, R., Chopra, S., LeCun, Y.: Dimensionality reduction by learning an invariant mapping. In: IEEE International Conference on Computer Vision and Pattern Recognition (CVPR), New York, NY, USA, vol.\u00a02, pp. 1735\u20131742. Institute of Electrical and Electronics Engineers (IEEE) (2019). https:\/\/doi.org\/10.1109\/cvpr.2006.100","DOI":"10.1109\/cvpr.2006.100"},{"key":"20_CR23","doi-asserted-by":"publisher","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Dollar, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), New Orleans, LA, USA, pp. 15979\u201315988. Institute of Electrical and Electronics Engineers (IEEE) (2022). https:\/\/doi.org\/10.1109\/cvpr52688.2022.01553","DOI":"10.1109\/cvpr52688.2022.01553"},{"key":"20_CR24","doi-asserted-by":"publisher","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Seattle, WA, USA, pp. 9726\u20139735. Institute of Electrical and Electronics Engineers (IEEE) (2020). https:\/\/doi.org\/10.1109\/cvpr42600.2020.00975","DOI":"10.1109\/cvpr42600.2020.00975"},{"key":"20_CR25","doi-asserted-by":"publisher","unstructured":"Hendrycks, D., Gimpel, K.: Gaussian error linear units (GELUs). CoRR abs\/1606.08415 (2016). https:\/\/doi.org\/10.48550\/arXiv.1606.08415","DOI":"10.48550\/arXiv.1606.08415"},{"key":"20_CR26","unstructured":"Jabri, A., Owens, A., Efros, A.A.: Space-time correspondence as a contrastive random walk. In: Advances in Neural Information Processing Systems (NeurIPS). vol.\u00a034. Curran Assoc. Inc. (2020)"},{"key":"20_CR27","doi-asserted-by":"publisher","unstructured":"Jhuang, H., Gall, J., Zuffi, S., Schmid, C., Black, M.J.: Towards understanding action recognition. In: IEEE International Conference on Computer Vision (ICCV), Sydney, NSW, Aust, pp. 3192\u20133199. Institute of Electrical and Electronics Engineers (IEEE) (2013). https:\/\/doi.org\/10.1109\/iccv.2013.396","DOI":"10.1109\/iccv.2013.396"},{"key":"20_CR28","doi-asserted-by":"publisher","unstructured":"Jiang, Z., et al.: Concatenated masked autoencoders as spatial-temporal learner. CoRR abs\/2311.00961 (2023). https:\/\/doi.org\/10.48550\/arXiv.2311.00961","DOI":"10.48550\/arXiv.2311.00961"},{"key":"20_CR29","doi-asserted-by":"publisher","unstructured":"Kay, W., et al.: The kinetics human action video dataset. CoRR abs\/1705.06950 (2017). https:\/\/doi.org\/10.48550\/arXiv.1705.06950","DOI":"10.48550\/arXiv.1705.06950"},{"key":"20_CR30","unstructured":"Kenton, L., Devlin, J., Chang, M.W., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT, vol.\u00a01, pp. 4171\u20134186. Minneapolis, Minnesota (2019)"},{"key":"20_CR31","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (ICLR), New Orleans, LA, USA (2019)"},{"key":"20_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"527","DOI":"10.1007\/978-3-319-46448-0_32","volume-title":"Computer Vision \u2013 ECCV 2016","author":"I Misra","year":"2016","unstructured":"Misra, I., Zitnick, C.L., Hebert, M.: Shuffle and learn: unsupervised learning using temporal order verification. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 527\u2013544. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_32"},{"key":"20_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46466-4_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"M Noroozi","year":"2016","unstructured":"Noroozi, M., Favaro, P.: Unsupervised learning of visual representations by solving jigsaw puzzles. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 69\u201384. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_5"},{"key":"20_CR34","unstructured":"Oquab, M., et al.: DINOv2: learning robust visual features without supervision. Trans. Mach. Learn. Res. (2024). https:\/\/openreview.net\/forum?id=a68SUt6zFt"},{"key":"20_CR35","doi-asserted-by":"publisher","unstructured":"Pont-Tuset, J., Perazzi, F., Caelles, S., Arbel\u00e1ez, P., Sorkine-Hornung, A., Van\u00a0Gool, L.: The 2017 DAVIS challenge on video object segmentation. CoRR abs\/1704.00675 (2017). https:\/\/doi.org\/10.48550\/arXiv.1704.00675","DOI":"10.48550\/arXiv.1704.00675"},{"key":"20_CR36","doi-asserted-by":"publisher","unstructured":"Qing, Z., et al.: MAR: masked autoencoders for efficient action recognition. IEEE Trans. Multimedia 26, 218\u2013233 (2024). https:\/\/doi.org\/10.1109\/tmm.2023.3263288","DOI":"10.1109\/tmm.2023.3263288"},{"issue":"3","key":"20_CR37","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: ImageNet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015). https:\/\/doi.org\/10.1007\/s11263-015-0816-y","journal-title":"Int. J. Comput. Vis."},{"key":"20_CR38","doi-asserted-by":"publisher","unstructured":"Sermanet, P., et al.: Time-contrastive networks: self-supervised learning from video. In: IEEE International Conference on Robotics and Automation (ICRA), Brisbane, QLD, Australia, pp. 1134\u20131141. Institute of Electrical and Electronics Engineers (IEEE) (2018). https:\/\/doi.org\/10.1109\/icra.2018.8462891","DOI":"10.1109\/icra.2018.8462891"},{"key":"20_CR39","unstructured":"Spyros, G., Praveer, S., Nikos, K.: Unsupervised representation learning by predicting image rotations. In: International Conference on Learning Representations (ICLR), Vancouver, Canada (2018). https:\/\/openreview.net\/forum?id=S1v4N2l0-"},{"issue":"1","key":"20_CR40","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. J. Mach. Learn. Res. 15(1), 1929\u20131958 (2014)","journal-title":"J. Mach. Learn. Res."},{"key":"20_CR41","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: VideoMAE: masked autoencoders are data-efficient learners for self-supervised video pre-training. In: Advances in Neural Information Processing Systems (NeurIPS), vol.\u00a035, pp. 10078\u201310093. Curran Assoc. Inc. (2022)"},{"key":"20_CR42","doi-asserted-by":"publisher","unstructured":"Vaswani, A., et al.: Attention is all you need. CoRR abs\/1706.03762 (2017). https:\/\/doi.org\/10.48550\/arXiv.1706.03762","DOI":"10.48550\/arXiv.1706.03762"},{"key":"20_CR43","doi-asserted-by":"publisher","unstructured":"Vincent, P., Larochelle, H., Bengio, Y., Manzagol, P.A.: Extracting and composing robust features with denoising autoencoders. In: Proceedings of the 25th International Conference on Machine Learning - ICML 2008, Helsinki, Finland, pp. 1096\u20131103. ACM Press (2008). https:\/\/doi.org\/10.1145\/1390156.1390294","DOI":"10.1145\/1390156.1390294"},{"key":"20_CR44","doi-asserted-by":"publisher","unstructured":"Wang, L., et al.: VideoMAE V2: scaling video masked autoencoders with dual masking. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Vancouver, Canada, pp. 14549\u201314560. Institute of Electrical and Electronics Engineers (IEEE) (2023). https:\/\/doi.org\/10.1109\/cvpr52729.2023.01398","DOI":"10.1109\/cvpr52729.2023.01398"},{"key":"20_CR45","doi-asserted-by":"publisher","unstructured":"Wang, X., Jabri, A., Efros, A.A.: Learning correspondence from the cycle-consistency of time. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Long Beach, CA, USA, pp. 2561\u20132571. Institute of Electrical and Electronics Engineers (IEEE) (2019). https:\/\/doi.org\/10.1109\/cvpr.2019.00267","DOI":"10.1109\/cvpr.2019.00267"},{"key":"20_CR46","doi-asserted-by":"publisher","unstructured":"Wu, Z., Xiong, Y., Yu, S.X., Lin, D.: Unsupervised feature learning via non-parametric instance discrimination. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Salt Lake City, UT, USA. pp. 3733\u20133742. Institute of Electrical and Electronics Engineers (IEEE) (2018). https:\/\/doi.org\/10.1109\/cvpr.2018.00393","DOI":"10.1109\/cvpr.2018.00393"},{"key":"20_CR47","unstructured":"Xiao, T., Wang, X., Efros, A.A., Darrell, T.: What should not be contrastive in contrastive learning. In: International Conference on Learning Representations (ICLR), Vienna, Austria (2021)"},{"key":"20_CR48","doi-asserted-by":"publisher","unstructured":"Xie, R., Wang, C., Zeng, W., Wang, Y.: An empirical study of the collapsing problem in semi-supervised 2D human pose estimation. In: IEEE\/CVF International Conference on Computer Vision (ICCV), Montreal, QC, Canada, pp. 11220\u201311229. Institute of Electrical and Electronics Engineers (IEEE) (2021). https:\/\/doi.org\/10.1109\/iccv48922.2021.01105","DOI":"10.1109\/iccv48922.2021.01105"},{"key":"20_CR49","doi-asserted-by":"publisher","unstructured":"Xie, Z., et al.: SimMIM: a simple framework for masked image modeling. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), New Orleans, LA, USA, pp. 9643\u20139653. Institute of Electrical and Electronics Engineers (IEEE) (2022). https:\/\/doi.org\/10.1109\/cvpr52688.2022.00943","DOI":"10.1109\/cvpr52688.2022.00943"},{"key":"20_CR50","doi-asserted-by":"publisher","unstructured":"Yao, R., Lin, G., Xia, S., Zhao, J., Zhou, Y.: Video object segmentation and tracking: a survey. ACM Trans. Intell. Syst. Technol. 11(4), 36:1\u201347 (2020). https:\/\/doi.org\/10.1145\/3391743","DOI":"10.1145\/3391743"},{"key":"20_CR51","unstructured":"Zhou, J., et al.: iBOT: Image bert pre-training with online tokenizer. In: International Conference on Learning Representations (ICLR), Vienna, Austria (2022). https:\/\/openreview.net\/forum?id=ydopy-e6Dg"},{"key":"20_CR52","doi-asserted-by":"publisher","unstructured":"Zhou, Q., Liang, X., Gong, K., Lin, L.: Adaptive temporal encoding network for video instance-level human parsing. In: Proceedings of the 26th ACM International Conference on Multimedia, pp. 1527\u20131535. ACM (2018). https:\/\/doi.org\/10.1145\/3240508.3240660","DOI":"10.1145\/3240508.3240660"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73337-6_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T23:06:26Z","timestamp":1730329586000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73337-6_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031733369","9783031733376"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73337-6_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}