{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T19:45:50Z","timestamp":1742931950218,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":27,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819620630"},{"type":"electronic","value":"9789819620647"}],"license":[{"start":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T00:00:00Z","timestamp":1735344000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T00:00:00Z","timestamp":1735344000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2064-7_3","type":"book-chapter","created":{"date-parts":[[2024,12,27]],"date-time":"2024-12-27T19:24:20Z","timestamp":1735327460000},"page":"31-44","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["MS-SAM: Multi-scale SAM Based on\u00a0Dynamic Weighted Agent Attention"],"prefix":"10.1007","author":[{"given":"Enhui","family":"Yang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhibin","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,28]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"Bolya, D., Fu, C.Y., Dai, X., Zhang, P., Hoffman, J.: Hydra attention: efficient attention with many heads. In: European Conference on Computer Vision, pp. 35\u201349. Springer, Cham (2022)","key":"3_CR1","DOI":"10.1007\/978-3-031-25082-8_3"},{"doi-asserted-by":"crossref","unstructured":"Chen, Y., et al.: Mobile-former: bridging mobilenet and transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5270\u20135279 (2022)","key":"3_CR2","DOI":"10.1109\/CVPR52688.2022.00520"},{"unstructured":"Choromanski, K., et\u00a0al.: Rethinking attention with performers. arXiv preprint arXiv:2009.14794 (2020)","key":"3_CR3"},{"unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)","key":"3_CR4"},{"doi-asserted-by":"crossref","unstructured":"Fan, H., et al.: Multiscale vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6824\u20136835 (2021)","key":"3_CR5","DOI":"10.1109\/ICCV48922.2021.00675"},{"doi-asserted-by":"crossref","unstructured":"Gu, J., et al.: Multi-scale high-resolution vision transformer for semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12094\u201312103 (2022)","key":"3_CR6","DOI":"10.1109\/CVPR52688.2022.01178"},{"doi-asserted-by":"crossref","unstructured":"Guo, J., et al.: CMT: convolutional neural networks meet vision transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12175\u201312185 (2022)","key":"3_CR7","DOI":"10.1109\/CVPR52688.2022.01186"},{"doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: Lvis: a dataset for large vocabulary instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5356\u20135364 (2019)","key":"3_CR8","DOI":"10.1109\/CVPR.2019.00550"},{"doi-asserted-by":"crossref","unstructured":"Han, D., Ye, T., Han, Y., Xia, Z., Song, S., Huang, G.: Agent attention: on the integration of softmax and linear attention. arXiv preprint arXiv:2312.08874 (2023)","key":"3_CR9","DOI":"10.1007\/978-3-031-72973-7_8"},{"unstructured":"Katharopoulos, A., Vyas, A., Pappas, N., Fleuret, F.: Transformers are RNNs: fast autoregressive transformers with linear attention. In: International Conference on Machine Learning, pp. 5156\u20135165. PMLR (2020)","key":"3_CR10"},{"unstructured":"Ke, L., et\u00a0al.: Segment anything in high quality. In: Advances in Neural Information Processing Systems, vol. 36 (2024)","key":"3_CR11"},{"doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4015\u20134026 (2023)","key":"3_CR12","DOI":"10.1109\/ICCV51070.2023.00371"},{"doi-asserted-by":"crossref","unstructured":"Li, Y., Mao, H., Girshick, R., He, K.: Exploring plain vision transformer backbones for object detection. In: European Conference on Computer Vision, pp. 280\u2013296. Springer, Cham (2022)","key":"3_CR13","DOI":"10.1007\/978-3-031-20077-9_17"},{"doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Mvitv2: improved multiscale vision transformers for classification and detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4804\u20134814 (2022)","key":"3_CR14","DOI":"10.1109\/CVPR52688.2022.00476"},{"doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2117\u20132125 (2017)","key":"3_CR15","DOI":"10.1109\/CVPR.2017.106"},{"key":"3_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"unstructured":"Mehta, S., Rastegari, M.: Mobilevit: light-weight, general-purpose, and mobile-friendly vision transformer. arXiv preprint arXiv:2110.02178 (2021)","key":"3_CR17"},{"doi-asserted-by":"crossref","unstructured":"Pan, X., et al.: On the integration of self-attention and convolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 815\u2013825 (2022)","key":"3_CR18","DOI":"10.1109\/CVPR52688.2022.00089"},{"key":"3_CR19","first-page":"8026","volume":"32","author":"A Paszke","year":"1912","unstructured":"Paszke, A., et al.: An imperative style, high-performance deep learning library. Adv. Neural. Inf. Process. Syst. 32, 8026 (1912)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"unstructured":"Paszke, A., et\u00a0al.: Pytorch: an imperative style, high-performance deep learning library. In: Advances in Neural Information Processing Systems, vol. 32 (2019)","key":"3_CR20"},{"doi-asserted-by":"crossref","unstructured":"Varghese, R., Sambath, M.: Yolov8: a novel object detection algorithm with enhanced performance and robustness. In: 2024 International Conference on Advances in Data Engineering and Intelligent Computing Systems (ADICS), pp.\u00a01\u20136. IEEE (2024)","key":"3_CR21","DOI":"10.1109\/ADICS58448.2024.10533619"},{"unstructured":"Vaswani, A.: Attention is all you need. arXiv preprint arXiv:1706.03762 (2017)","key":"3_CR22"},{"doi-asserted-by":"crossref","unstructured":"Xiong, Y., et\u00a0al.: Efficientsam: leveraged masked image pretraining for efficient segment anything. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16111\u201316121 (2024)","key":"3_CR23","DOI":"10.1109\/CVPR52733.2024.01525"},{"unstructured":"Zhang, C., et al.: Faster segment anything: towards lightweight SAM for mobile applications. arXiv preprint arXiv:2306.14289 (2023)","key":"3_CR24"},{"doi-asserted-by":"crossref","unstructured":"Zhang, Z., Cai, H., Han, S.: Efficientvit-sam: accelerated segment anything model without performance loss. arXiv preprint arXiv:2402.05008 (2024)","key":"3_CR25","DOI":"10.1109\/CVPRW63382.2024.00782"},{"unstructured":"Zhao, X., et al.: Fast segment anything. arXiv preprint arXiv:2306.12156 (2023)","key":"3_CR26"},{"unstructured":"Zhou, C., Li, X., Loy, C.C., Dai, B.: Edgesam: prompt-in-the-loop distillation for on-device deployment of SAM. arXiv preprint arXiv:2312.06660 (2023)","key":"3_CR27"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2064-7_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,27]],"date-time":"2024-12-27T20:02:20Z","timestamp":1735329740000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2064-7_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,28]]},"ISBN":["9789819620630","9789819620647"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2064-7_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,28]]},"assertion":[{"value":"28 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Nara","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 January 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2025.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}