{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T02:53:24Z","timestamp":1768272804726,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":30,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819556755","type":"print"},{"value":"9789819556762","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5676-2_11","type":"book-chapter","created":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T20:32:04Z","timestamp":1768249924000},"page":"158-173","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A Two-Stage Multimodal Framework for\u00a0Real-Time Item Pickup and\u00a0Return Recognition in\u00a0Unmanned Retail Stores"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-1621-2343","authenticated-orcid":false,"given":"Shenghong","family":"Zhong","sequence":"first","affiliation":[]},{"given":"Bi","family":"Zeng","sequence":"additional","affiliation":[]},{"given":"Jinjie","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yujun","family":"Zhu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"11_CR1","doi-asserted-by":"crossref","unstructured":"Chen, Y., Zhang, Z., Yuan, C., Li, B., Deng, Y., Hu, W.: Channel-wise topology refinement graph convolution for skeleton-based action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13359\u201313368 (2021)","DOI":"10.1109\/ICCV48922.2021.01311"},{"key":"11_CR2","doi-asserted-by":"crossref","unstructured":"Cheng, K., Zhang, Y., He, X., Chen, W., Cheng, J., Lu, H.: Skeleton-based action recognition with shift graph convolutional network. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 180\u2013189 (2020)","DOI":"10.1109\/CVPR42600.2020.00026"},{"key":"11_CR3","doi-asserted-by":"crossref","unstructured":"Chi, H.G., Ha, M.H., Chi, S., Lee, S.W., Huang, Q., Ramani, K.: Infogcn: representation learning for human skeleton-based action recognition. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 20154\u201320164 (2022)","DOI":"10.1109\/CVPR52688.2022.01955"},{"issue":"1","key":"11_CR4","doi-asserted-by":"publisher","first-page":"514","DOI":"10.1109\/TPAMI.2024.3466212","volume":"47","author":"S Chi","year":"2025","unstructured":"Chi, S., Chi, H.G., Huang, Q., Ramani, K.: Infogcn++: learning representation by predicting the future for online skeleton-based action recognition. IEEE Trans. Pattern Anal. Mach. Intell. 47(1), 514\u2013528 (2025)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"11_CR5","doi-asserted-by":"crossref","unstructured":"Chollet, F.: Xception: deep learning with depthwise separable convolutions. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1800\u20131807 (2017)","DOI":"10.1109\/CVPR.2017.195"},{"key":"11_CR6","doi-asserted-by":"publisher","first-page":"401","DOI":"10.1007\/978-3-031-72940-9_23","volume-title":"Computer Vision - ECCV 2024","author":"J Do","year":"2025","unstructured":"Do, J., Kim, M.: Skateformer: skeletal-temporal transformer for human action recognition. In: Leonardis, A., Ricci, E., Roth, S., Russakovsky, O., Sattler, T., Varol, G. (eds.) Computer Vision - ECCV 2024, pp. 401\u2013420. Springer Nature Switzerland, Cham (2025)"},{"key":"11_CR7","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"11_CR8","doi-asserted-by":"crossref","unstructured":"Horng, S.J., Huang, P.S.: Building unmanned store identification systems using yolov4 and siamese network. Appli. Sci. 12(8) (2022)","DOI":"10.3390\/app12083826"},{"key":"11_CR9","doi-asserted-by":"crossref","unstructured":"Howard, A., et al.: Searching for mobilenetv3. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1314\u20131324 (2019)","DOI":"10.1109\/ICCV.2019.00140"},{"key":"11_CR10","unstructured":"Iandola, F.N., Han, S., Moskewicz, M.W., Ashraf, K., Dally, W.J., Keutzer, K.: Squeezenet: Alexnet-level accuracy with 50x fewer parameters and <0.5mb model size (2016)"},{"key":"11_CR11","doi-asserted-by":"crossref","unstructured":"Jose, J.A.C., et al.: Smart shelf system for customer behavior tracking in supermarkets. Sensors 24(2) (2024)","DOI":"10.3390\/s24020367"},{"key":"11_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2019.113063","volume":"143","author":"DH Kim","year":"2020","unstructured":"Kim, D.H., Lee, S., Jeon, J., Song, B.C.: Real-time purchase behavior recognition system based on deep learning-based object detection and tracking for an unmanned product cabinet. Expert Syst. Appl. 143, 113063 (2020)","journal-title":"Expert Syst. Appl."},{"key":"11_CR13","doi-asserted-by":"crossref","unstructured":"Lee, J., Lee, M., Lee, D., Lee, S.: Hierarchically decomposed graph convolutional networks for skeleton-based action recognition. In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10410\u201310419 (2023)","DOI":"10.1109\/ICCV51070.2023.00958"},{"key":"11_CR14","doi-asserted-by":"crossref","unstructured":"Li, B., et al.: Anomalous behavior detection with spatiotemporal interaction and autoencoder enhancement. Electronics 12(11) (2023)","DOI":"10.3390\/electronics12112438"},{"key":"11_CR15","doi-asserted-by":"publisher","first-page":"21038","DOI":"10.1109\/ACCESS.2024.3361675","volume":"12","author":"J Li","year":"2024","unstructured":"Li, J., Tang, F., Zhu, C., He, S., Zhang, S., Su, Y.: Bp-yolo: a real-time product detection and shopping behaviors recognition model for intelligent unmanned vending machine. IEEE Access 12, 21038\u201321051 (2024)","journal-title":"IEEE Access"},{"issue":"4","key":"11_CR16","doi-asserted-by":"publisher","first-page":"3455","DOI":"10.1109\/JSEN.2022.3140356","volume":"22","author":"MH Lin","year":"2022","unstructured":"Lin, M.H., Sarwar, M.A., Daraghmi, Y.A., \u0130k, T.U.: On-shelf load cell calibration for positioning and weighing assisted by activity detection: smart store scenario. IEEE Sens. J. 22(4), 3455\u20133463 (2022)","journal-title":"IEEE Sens. J."},{"key":"11_CR17","unstructured":"Luo, D., Wang, X.: ModernTCN: a modern pure convolution structure for general time series analysis. In: The Twelfth International Conference on Learning Representations, pp. 1\u201343 (2024)"},{"key":"11_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"122","DOI":"10.1007\/978-3-030-01264-9_8","volume-title":"Computer Vision \u2013 ECCV 2018","author":"N Ma","year":"2018","unstructured":"Ma, N., Zhang, X., Zheng, H.-T., Sun, J.: ShuffleNet V2: practical guidelines for efficient CNN architecture design. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) Computer Vision \u2013 ECCV 2018. LNCS, vol. 11218, pp. 122\u2013138. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01264-9_8"},{"key":"11_CR19","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A., Zhu, M., Zhmoginov, A., Chen, L.C.: Mobilenetv2: inverted residuals and linear bottlenecks. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4510\u20134520 (2018)","DOI":"10.1109\/CVPR.2018.00474"},{"key":"11_CR20","doi-asserted-by":"crossref","unstructured":"Shaker, A., Maaz, M., Rasheed, H., Khan, S., Yang, M.H., Khan, F.S.: Swiftformer: efficient additive attention for transformer-based real-time mobile vision applications. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 17425\u201317436 (2023)","DOI":"10.1109\/ICCV51070.2023.01598"},{"key":"11_CR21","doi-asserted-by":"crossref","unstructured":"Shi, L., Zhang, Y., Cheng, J., Lu, H.: Two-stream adaptive graph convolutional networks for skeleton-based action recognition. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12018\u201312027 (2019)","DOI":"10.1109\/CVPR.2019.01230"},{"key":"11_CR22","doi-asserted-by":"crossref","unstructured":"Singh, B., Marks, T.K., Jones, M., Tuzel, O., Shao, M.: A multi-stream bi-directional recurrent neural network for fine-grained action detection. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1961\u20131970 (2016)","DOI":"10.1109\/CVPR.2016.216"},{"key":"11_CR23","unstructured":"Tan, M., Le, Q.: Efficientnet: rethinking model scaling for convolutional neural networks. In: International Conference on Machine Learning, pp. 6105\u20136114. PMLR (2019)"},{"key":"11_CR24","unstructured":"Ultralytics: Pose estimation \u2013 ultralytics yolo docs (2024)"},{"key":"11_CR25","doi-asserted-by":"crossref","unstructured":"Wang, Q., Wu, B., Zhu, P., Li, P., Zuo, W., Hu, Q.: Eca-net: Efficient channel attention for deep convolutional neural networks. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 11531\u201311539 (2020)","DOI":"10.1109\/CVPR42600.2020.01155"},{"key":"11_CR26","doi-asserted-by":"crossref","unstructured":"Wen, J., Abe, T., Suganuma, T.: A customer behavior recognition method for flexibly adapting to target changes in retail stores. Sensors 22(18) (2022)","DOI":"10.3390\/s22186740"},{"key":"11_CR27","doi-asserted-by":"crossref","unstructured":"Yan, S., Xiong, Y., Lin, D.: Spatial temporal graph convolutional networks for skeleton-based action recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32(1) (Apr 2018)","DOI":"10.1609\/aaai.v32i1.12328"},{"issue":"2","key":"11_CR28","doi-asserted-by":"publisher","first-page":"254","DOI":"10.1109\/TAI.2021.3116227","volume":"3","author":"HB Zhang","year":"2022","unstructured":"Zhang, H.B., Zhou, Y.Z., Dong, L.J., Lei, Q., Du, J.X.: Design and implementation of a vision- and grating-sensor-based intelligent unmanned settlement system. IEEE Trans. Artifi. Intell. 3(2), 254\u2013264 (2022)","journal-title":"IEEE Trans. Artifi. Intell."},{"key":"11_CR29","doi-asserted-by":"crossref","unstructured":"Zheng, Y., Huang, H., Wang, X., Yan, X., Xu, L.: Spatio-temporal fusion for human action recognition via joint trajectory graph. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38(7), 7579\u20137587 (2024)","DOI":"10.1609\/aaai.v38i7.28590"},{"key":"11_CR30","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Yan, X., Cheng, Z.Q., Yan, Y., Dai, Q., Hua, X.S.: Blockgcn: redefine topology awareness for skeleton-based action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2049\u20132058 (2024)","DOI":"10.1109\/CVPR52733.2024.00200"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5676-2_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T20:32:10Z","timestamp":1768249930000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5676-2_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819556755","9789819556762"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5676-2_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"13 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}