{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T07:46:51Z","timestamp":1774338411077,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":26,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819756087","type":"print"},{"value":"9789819756094","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-97-5609-4_9","type":"book-chapter","created":{"date-parts":[[2024,7,30]],"date-time":"2024-07-30T12:02:04Z","timestamp":1722340924000},"page":"114-130","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Spatial-Temporal Transformer Network for Continuous Action Recognition in Industrial Assembly"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1630-5322","authenticated-orcid":false,"given":"Jianfeng","family":"Huang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2492-403X","authenticated-orcid":false,"given":"Xiang","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6697-3220","authenticated-orcid":false,"given":"Huan","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shanghua","family":"Tang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6841-9660","authenticated-orcid":false,"given":"Chenyang","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shaoan","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8096-4819","authenticated-orcid":false,"given":"Yimin","family":"Lin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1171-0281","authenticated-orcid":false,"given":"Kai","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1267-0277","authenticated-orcid":false,"given":"Zhaoxiang","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4308-7049","authenticated-orcid":false,"given":"Shiguo","family":"Lian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,7,31]]},"reference":[{"issue":"4","key":"9_CR1","doi-asserted-by":"publisher","first-page":"2046","DOI":"10.1177\/0954406220931547","volume":"236","author":"M Al-Amin","year":"2022","unstructured":"Al-Amin, M., et al.: Fusing and refining convolutional neural network models for assembly action recognition in smart manufacturing. Proc. Inst. Mech. Eng. C J. Mech. Eng. Sci. 236(4), 2046\u20132059 (2022)","journal-title":"Proc. Inst. Mech. Eng. C J. Mech. Eng. Sci."},{"key":"9_CR2","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: ViviT: a video vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6836\u20136846 (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"9_CR3","unstructured":"Assis, D., Ayala, N.F., Frank, A.G.: Smart working in industry 4.0: how digital technologies enhance manufacturing workers\u2019 activities (2021)"},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Chao, Y.W., Wang, Z., He, Y., Wang, J., Deng, J.: Hico: a benchmark for recognizing human-object interactions in images. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1017\u20131025 (2015)","DOI":"10.1109\/ICCV.2015.122"},{"issue":"15","key":"9_CR5","doi-asserted-by":"publisher","first-page":"4208","DOI":"10.3390\/s20154208","volume":"20","author":"C Chen","year":"2020","unstructured":"Chen, C., et al.: Monitoring of assembly process using deep learning technology. Sensors 20(15), 4208 (2020)","journal-title":"Sensors"},{"key":"9_CR6","doi-asserted-by":"crossref","unstructured":"Chiou, M.J., Liao, C.Y., Wang, L.W., Zimmermann, R., Feng, J.: St-hoi: a spatial-temporal baseline for human-object interaction detection in videos. In: Proceedings of the 2021 Workshop on Intelligent Cross-Data Analysis and Retrieval, pp. 9\u201317 (2021)","DOI":"10.1145\/3463944.3469097"},{"key":"9_CR7","doi-asserted-by":"publisher","first-page":"1254","DOI":"10.1016\/j.procir.2019.03.303","volume":"81","author":"M Faccio","year":"2019","unstructured":"Faccio, M., Ferrari, E., Galizia, F.G., Gamberi, M., Pilati, F.: Real-time assistance to manual assembly through depth camera and visual feedback. Procedia CIRP 81, 1254\u20131259 (2019)","journal-title":"Procedia CIRP"},{"key":"9_CR8","doi-asserted-by":"crossref","unstructured":"Ji, J., Desai, R., Niebles, J.C.: Detecting human-object relationships in videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8106\u20138116 (2021)","DOI":"10.1109\/ICCV48922.2021.00800"},{"key":"9_CR9","doi-asserted-by":"crossref","unstructured":"Kim, B., Lee, J., Kang, J., Kim, E.S., Kim, H.J.: Hotr: end-to-end human-object interaction detection with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 74\u201383 (2021)","DOI":"10.1109\/CVPR46437.2021.00014"},{"key":"9_CR10","doi-asserted-by":"crossref","unstructured":"Liao, Y., Zhang, A., Lu, M., Wang, Y., Li, X., Liu, S.: Gen-vlkt: simplify association and enhance interaction understanding for hoi detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20123\u201320132 (2022)","DOI":"10.1109\/CVPR52688.2022.01949"},{"issue":"12","key":"9_CR11","doi-asserted-by":"publisher","first-page":"9703","DOI":"10.1109\/TIE.2018.2884206","volume":"66","author":"L Liu","year":"2018","unstructured":"Liu, L., Liu, Y., Zhang, J.: Learning-based hand motion capture and understanding in assembly process. IEEE Trans. Industr. Electron. 66(12), 9703\u20139712 (2018)","journal-title":"IEEE Trans. Industr. Electron."},{"key":"9_CR12","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Video swin transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3202\u20133211 (2022)","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"9_CR13","doi-asserted-by":"crossref","unstructured":"Mohsen, S., Elkaseer, A., Scholz, S.G.: Industry 4.0-oriented deep learning models for human activity recognition. IEEE Access 9, 150508\u2013150521 (2021)","DOI":"10.1109\/ACCESS.2021.3125733"},{"key":"9_CR14","doi-asserted-by":"publisher","first-page":"582","DOI":"10.1016\/j.procir.2022.05.029","volume":"107","author":"M M\u00fchlbauer","year":"2022","unstructured":"M\u00fchlbauer, M., Kutzner, K., Sommer, A., W\u00fcrschinger, H., Hanenkamp, N.: An approach to progress monitoring of industrial manual processes based on camera recordings and object interactions. Procedia CIRP 107, 582\u2013587 (2022)","journal-title":"Procedia CIRP"},{"key":"9_CR15","doi-asserted-by":"crossref","unstructured":"Qi, S., Wang, W., Jia, B., Shen, J., Zhu, S.C.: Learning human-object interactions by graph parsing neural networks. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 401\u2013417 (2018)","DOI":"10.1007\/978-3-030-01240-3_25"},{"issue":"4","key":"9_CR16","first-page":"393","volume":"16","author":"A Riedel","year":"2021","unstructured":"Riedel, A., et al.: A deep learning-based worker assistance system for error prevention: case study in a real-world manual assembly. Adv. Prod. Eng. Manage. 16(4), 393\u2013404 (2021)","journal-title":"Adv. Prod. Eng. Manage."},{"key":"9_CR17","doi-asserted-by":"crossref","unstructured":"Shang, X., Di, D., Xiao, J., Cao, Y., Yang, X., Chua, T.S.: Annotating objects and relations in user-generated videos. In: Proceedings of the 2019 on International Conference on Multimedia Retrieval, pp. 279\u2013287 (2019)","DOI":"10.1145\/3323873.3325056"},{"key":"9_CR18","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2020.103868","volume":"95","author":"W Tao","year":"2020","unstructured":"Tao, W., Leu, M.C., Yin, Z.: Multi-modal recognition of worker activity for human-centered intelligent manufacturing. Eng. Appl. Artif. Intell. 95, 103868 (2020)","journal-title":"Eng. Appl. Artif. Intell."},{"issue":"2","key":"9_CR19","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/2812802","volume":"59","author":"B Thomee","year":"2016","unstructured":"Thomee, B., et al.: Yfcc100m: the new data in multimedia research. Commun. ACM 59(2), 64\u201373 (2016)","journal-title":"Commun. ACM"},{"key":"9_CR20","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Proc. Syst. 30 (2017)"},{"key":"9_CR21","doi-asserted-by":"crossref","unstructured":"Wan, B., Zhou, D., Liu, Y., Li, R., He, X.: Pose-aware multi-level feature network for human object interaction detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9469\u20139478 (2019)","DOI":"10.1109\/ICCV.2019.00956"},{"key":"9_CR22","first-page":"12077","volume":"34","author":"E Xie","year":"2021","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: Segformer: simple and efficient design for semantic segmentation with transformers. Adv. Neural. Inf. Process. Syst. 34, 12077\u201312090 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR23","doi-asserted-by":"crossref","unstructured":"Yan, S., et al.: Multiview transformers for video recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3333\u20133343 (2022)","DOI":"10.1109\/CVPR52688.2022.00333"},{"key":"9_CR24","doi-asserted-by":"crossref","unstructured":"Yuan, H., Wang, M., Ni, D., Xu, L.: Detecting human-object interactions with object-guided cross-modal calibrated semantics. arXiv preprint arXiv:2202.00259 (2022)","DOI":"10.1609\/aaai.v36i3.20229"},{"key":"9_CR25","doi-asserted-by":"crossref","unstructured":"Zeng, R., et al.: Graph convolutional networks for temporal action localization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7094\u20137103 (2019)","DOI":"10.1109\/ICCV.2019.00719"},{"key":"9_CR26","doi-asserted-by":"crossref","unstructured":"Zheng, Z., Wang, P., Liu, W., Li, J., Ye, R., Ren, D.: Distance-iou loss: Faster and better learning for bounding box regression. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol. 34, pp. 12993\u201313000 (2020)","DOI":"10.1609\/aaai.v34i07.6999"}],"container-title":["Lecture Notes in Computer Science","Advanced Intelligent Computing Technology and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-5609-4_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,30]],"date-time":"2024-07-30T12:04:00Z","timestamp":1722341040000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-5609-4_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9789819756087","9789819756094"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-5609-4_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"31 July 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Intelligent Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tianjin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 August 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 August 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icic2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.ic-icc.cn\/2024\/index.htm","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}