{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T17:23:03Z","timestamp":1774718583457,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":25,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819629138","type":"print"},{"value":"9789819629145","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2914-5_20","type":"book-chapter","created":{"date-parts":[[2025,3,10]],"date-time":"2025-03-10T09:35:29Z","timestamp":1741599329000},"page":"211-222","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Multi-modal Spatio-temporal Transformer for Defect Recognition of Substation Equipment"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-3427-3169","authenticated-orcid":false,"given":"Yiyang","family":"Yao","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0954-2757","authenticated-orcid":false,"given":"Zexing","family":"Du","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5224-906X","authenticated-orcid":false,"given":"Xue","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3439-0644","authenticated-orcid":false,"given":"Qing","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,11]]},"reference":[{"issue":"15","key":"20_CR1","doi-asserted-by":"publisher","first-page":"6821","DOI":"10.3390\/s23156821","volume":"23","author":"RG Kim","year":"2023","unstructured":"Kim, R.G., Abisado, M., Villaverde, J., Sampedro, G.A.: A survey of image-based fault monitoring in additive manufacturing: recent developments and future directions. Sensors 23(15), 6821 (2023)","journal-title":"Sensors"},{"issue":"2","key":"20_CR2","doi-asserted-by":"publisher","first-page":"924","DOI":"10.1109\/TGRS.2018.2863224","volume":"57","author":"L Mou","year":"2019","unstructured":"Mou, L., Bruzzone, L., Zhu, X.X.: Learning spectral-spatial-temporal features via a recurrent convolutional neural network for change detection in multispectral imagery. IEEE Trans. Geosci. Remote Sens. 57(2), 924\u2013935 (2019)","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"20_CR3","doi-asserted-by":"publisher","first-page":"01022","DOI":"10.1051\/bioconf\/20236801022","volume":"68","author":"J Ara\u00fajo","year":"2023","unstructured":"Ara\u00fajo, J., et al.: Satellite and UAV-based anomaly detection in vineyards. BIO Web Conf. 68, 01022 (2023)","journal-title":"BIO Web Conf."},{"key":"20_CR4","doi-asserted-by":"crossref","unstructured":"Ji, W., et al.: Multispectral video semantic segmentation: a benchmark dataset and baseline. In: CVPR, pp. 1094\u20131104 (2023)","DOI":"10.1109\/CVPR52729.2023.00112"},{"key":"20_CR5","volume":"122","author":"H Wei","year":"2023","unstructured":"Wei, H., et al.: Real-time remote sensing detection framework of the earth\u2019s surface anomalies based on a priori knowledge base. Int. J. Appl. Earth Obs. Geoinf. 122, 103429 (2023)","journal-title":"Int. J. Appl. Earth Obs. Geoinf."},{"issue":"13","key":"20_CR6","doi-asserted-by":"publisher","first-page":"3014","DOI":"10.3390\/s19133014","volume":"19","author":"B Jalil","year":"2019","unstructured":"Jalil, B., Leone, G.R., Martinelli, M., Moroni, D., Pascali, M.A., Berton, A.: Fault detection in power equipment via an unmanned aerial system using multi modal data. Sensors 19(13), 3014 (2019)","journal-title":"Sensors"},{"key":"20_CR7","doi-asserted-by":"publisher","first-page":"149999","DOI":"10.1109\/ACCESS.2020.3016213","volume":"8","author":"S Kim","year":"2020","unstructured":"Kim, S., Kim, D., Jeong, S., Ham, J.W., Lee, J.K., Oh, K.Y.: Fault diagnosis of power transmission lines using a UAV-mounted smart inspection system. IEEE Access 8, 149999\u2013150009 (2020)","journal-title":"IEEE Access"},{"key":"20_CR8","doi-asserted-by":"publisher","DOI":"10.1088\/1757-899X\/231\/1\/012062","volume":"231","author":"Q Lu","year":"2017","unstructured":"Lu, Q., Ding, K.: Research on fault detection systems of power equipment based on UV and infrared image. IOP Conf. Ser. Mater. Sci. Eng. 231, 012062 (2017)","journal-title":"IOP Conf. Ser. Mater. Sci. Eng."},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Lai, X., Deng, Z., Jiang, J., Wang, Y., Chen, Z., Yao, Y.: Tri-modality fusion imaging system for the electric power industry. In: IEEE International Conference on Power, Intelligent Computing and Systems, pp. 301\u2013305 (2022)","DOI":"10.1109\/ICPICS55264.2022.9873623"},{"key":"20_CR10","first-page":"194","volume":"10","author":"L Cai","year":"2021","unstructured":"Cai, L., et al.: High-precision temperature measurement and calibration technology of infrared thermal imager. Infrared Laser Eng. 10, 194\u2013201 (2021)","journal-title":"Infrared Laser Eng."},{"key":"20_CR11","doi-asserted-by":"publisher","DOI":"10.1016\/j.ijepes.2022.108567","volume":"144","author":"Z Xing","year":"2023","unstructured":"Xing, Z., He, Y.: Multi-modal information analysis for fault diagnosis with time series data from power transformer. Int. J. Electr. Power Energy Syst. 144, 108567 (2023)","journal-title":"Int. J. Electr. Power Energy Syst."},{"key":"20_CR12","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"20_CR13","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021)"},{"key":"20_CR14","doi-asserted-by":"publisher","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"20_CR15","unstructured":"Liu, Z., et al.: Video swin transformer. In: CVPR, pp. 3202\u20133211 (2022)"},{"key":"20_CR16","doi-asserted-by":"publisher","first-page":"4002","DOI":"10.1109\/TIP.2024.3413599","volume":"33","author":"Y Tang","year":"2024","unstructured":"Tang, Y., Wang, W., Zhang, C., Liu, J., Zhao, Y.: Learnable feature augmentation framework for temporal action localization. IEEE Trans. Image Process. 33, 4002\u20134015 (2024)","journal-title":"IEEE Trans. Image Process."},{"issue":"10","key":"20_CR17","doi-asserted-by":"publisher","first-page":"12113","DOI":"10.1109\/TPAMI.2023.3275156","volume":"45","author":"P Xu","year":"2023","unstructured":"Xu, P., Zhu, X., Clifton, D.A.: Multimodal learning with transformers: a survey. IEEE Trans. Pattern Anal. Mach. Intell. 45(10), 12113\u201312132 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"20_CR18","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.110084","volume":"147","author":"C Chen","year":"2024","unstructured":"Chen, C., Han, D., Chang, C.C.: MPCCT: multimodal vision-language learning paradigm with context-based compact transformer. Pattern Recogn. 147, 110084 (2024)","journal-title":"Pattern Recogn."},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Srivastava, S., Sharma, G.: OmniVec2 - a novel transformer based network for large scale multimodal and multitask learning. In: CVPR, pp. 27412\u201327424 (2024)","DOI":"10.1109\/CVPR52733.2024.02588"},{"key":"20_CR20","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021)"},{"key":"20_CR21","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: Proceedings of the International Conference on Machine Learning, pp. 1597\u20131607 (2020)"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"20_CR23","doi-asserted-by":"publisher","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"20_CR24","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: CVPR, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"20_CR25","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: MViTv2: improved multiscale vision transformers for classification and detection. In: CVPR, pp. 4804\u20134814 (2022)","DOI":"10.1109\/CVPR52688.2022.00476"}],"container-title":["Communications in Computer and Information Science","Artificial Intelligence and Robotics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2914-5_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,10]],"date-time":"2025-03-10T09:35:38Z","timestamp":1741599338000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2914-5_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819629138","9789819629145"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2914-5_20","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"value":"1865-0929","type":"print"},{"value":"1865-0937","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"11 March 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ISAIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Symposium on Artificial Intelligence and Robotics","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Guilin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"isair2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/isair.site\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}