{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,8]],"date-time":"2026-04-08T16:27:32Z","timestamp":1775665652975,"version":"3.50.1"},"reference-count":82,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T00:00:00Z","timestamp":1748736000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T00:00:00Z","timestamp":1748736000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T00:00:00Z","timestamp":1748736000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62225604"],"award-info":[{"award-number":["62225604"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276145"],"award-info":[{"award-number":["62276145"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["070-63223049"],"award-info":[{"award-number":["070-63223049"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Supercomputing Center of Nankai University"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1109\/tpami.2025.3538473","type":"journal-article","created":{"date-parts":[[2025,2,4]],"date-time":"2025-02-04T18:35:08Z","timestamp":1738694108000},"page":"4240-4252","source":"Crossref","is-referenced-by-count":139,"title":["YOLO-MS: Rethinking Multi-Scale Representation Learning for Real-Time Object Detection"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-6357-8375","authenticated-orcid":false,"given":"Yuming","family":"Chen","sequence":"first","affiliation":[{"name":"VCIP, School of Computer Science, Nankai University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8715-5497","authenticated-orcid":false,"given":"Xinbin","family":"Yuan","sequence":"additional","affiliation":[{"name":"VCIP, School of Computer Science, Nankai University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4951-3088","authenticated-orcid":false,"given":"Jiabao","family":"Wang","sequence":"additional","affiliation":[{"name":"VCIP, School of Computer Science, Nankai University, Tianjin, China"}]},{"given":"Ruiqi","family":"Wu","sequence":"additional","affiliation":[{"name":"VCIP, School of Computer Science, Nankai University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4996-7365","authenticated-orcid":false,"given":"Xiang","family":"Li","sequence":"additional","affiliation":[{"name":"VCIP, School of Computer Science, Nankai University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8388-8708","authenticated-orcid":false,"given":"Qibin","family":"Hou","sequence":"additional","affiliation":[{"name":"VCIP, School of Computer Science, Nankai University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5550-8758","authenticated-orcid":false,"given":"Ming-Ming","family":"Cheng","sequence":"additional","affiliation":[{"name":"VCIP, School of Computer Science, Nankai University, Tianjin, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"YOLOV4: Optimal speed and accuracy of object detection","author":"Bochkovskiy","year":"2020"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00644"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00511"},{"key":"ref5","article-title":"MMDetection: Open MMlab detection toolbox and benchmark","author":"Chen","year":"2019"},{"key":"ref6","article-title":"Rethinking atrous convolution for semantic image segmentation","author":"Chen","year":"2017"},{"key":"ref7","first-page":"379","article-title":"R-FCN: Object detection via region-based fully convolutional networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dai"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01166"},{"key":"ref10","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2017.12.012"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00349"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.10.039"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2938758"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3183829"},{"key":"ref16","article-title":"YOLOx: Exceeding YOLO series in 2021,","author":"Ge","year":"2021"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00294"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref20","first-page":"1140","article-title":"Segnext: Rethinking convolutional attention design for semantic segmentation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Guo"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-023-0364-2"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-022-0271-y"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3152247"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2389824"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3401450"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045167"},{"key":"ref28","article-title":"YOLOV5","author":"Jocher","year":"2020"},{"key":"ref29","article-title":"YOLOV8","author":"Jocher","year":"2023"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3505244"},{"key":"ref31","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2017"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/s00591-010-0080-8"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2867951"},{"key":"ref34","article-title":"YOLOv6: A single-stage object detection framework for industrial applications","author":"Li","year":"2022"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01325"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01146"},{"key":"ref37","first-page":"21002","article-title":"Generalized focal loss: Learning qualified and distributed bounding boxes for dense object detection","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01540"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-26348-4_19"},{"key":"ref40","article-title":"Towards raw object detection in diverse conditions","author":"Li","year":"2024"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2858826"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref44","article-title":"DAB-DETR: Dynamic anchor boxes are better queries for DETR","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Liu"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00913"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref47","first-page":"4898","article-title":"Understanding the effective receptive field in deep convolutional neural networks","volume-title":"Proc. Adv. Neural Inform. Process. Syst.","author":"Luo"},{"key":"ref48","article-title":"RTMDet: An empirical study of designing real-time object detectors","author":"Lyu","year":"2022"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-25082-8_1"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00091"},{"key":"ref52","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Paszke"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.690"},{"key":"ref55","article-title":"YOLOv3: An incremental improvement","author":"Redmon","year":"2018"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2016.2577031"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref59","first-page":"618","article-title":"Grad-CAM: Visual explanations from deep networks via gradient-based localization","volume-title":"Proc. Int. Conf. Comput. Vis.","author":"Ramprasaath"},{"key":"ref60","article-title":"Crowdhuman: A benchmark for detecting human in a crowd","author":"Shao","year":"2018"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00972"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref63","first-page":"107984","article-title":"YOLOv10: Real-time end-to-end object detection","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref64","first-page":"51094","article-title":"Gold-YOLO: Efficient object detector via gather-and-distribute mechanism","volume-title":"Adv. Neural Inf. Process. Syst.","author":"Wang","year":"2023"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01283"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00721"},{"key":"ref67","article-title":"Designing network design strategies through gradient path analysis","author":"Wang","year":"2022"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00203"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72751-1_1"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00308"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1984.1172729"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00418"},{"key":"ref73","article-title":"PP-YOLOE: An evolved version of YOLO","author":"Xu","year":"2022"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3438565"},{"key":"ref75","article-title":"DINO: DETR with improved denoising anchor boxes for end-to-end object detection","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhang"},{"key":"ref76","article-title":"Mixup: Beyond empirical risk minimization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhang"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00978"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01521"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3532440"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01605"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1911.08287"},{"key":"ref82","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhu"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/10990047\/10872821.pdf?arnumber=10872821","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T17:38:29Z","timestamp":1746725909000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10872821\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6]]},"references-count":82,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3538473","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,6]]}}}