{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T08:21:21Z","timestamp":1770970881507,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100014219","name":"National Science Fund for Distinguished Young Scholars","doi-asserted-by":"publisher","award":["62125601"],"award-info":[{"award-number":["62125601"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100014219","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072032,62076024"],"award-info":[{"award-number":["62072032,62076024"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681560","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"1034-1042","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Unsupervised Multi-view Pedestrian Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9719-8417","authenticated-orcid":false,"given":"Mengyin","family":"Liu","sequence":"first","affiliation":[{"name":"School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5486-7492","authenticated-orcid":false,"given":"Chao","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1861-6922","authenticated-orcid":false,"given":"Shiqi","family":"Ren","sequence":"additional","affiliation":[{"name":"School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0023-0220","authenticated-orcid":false,"given":"Xu-Cheng","family":"Yin","sequence":"additional","affiliation":[{"name":"School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.283"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.38"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00528"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMLA.2017.00-50"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00021"},{"key":"e_1_3_2_1_8_1","volume-title":"Multicamera people tracking with a probabilistic occupancy map","author":"Fleuret Francois","year":"2007","unstructured":"Francois Fleuret, Jerome Berclaz, Richard Lengagne, and Pascal Fua. 2007. Multicamera people tracking with a probabilistic occupancy map. IEEE transactions on pattern analysis and machine intelligence, Vol. 30, 2 (2007), 267--282."},{"key":"e_1_3_2_1_9_1","volume-title":"Video Surveillance and Transportation Imaging Applications","author":"Haritaoglu Ismail","unstructured":"Ismail Haritaoglu, Myron Flickner, and David Beymer. 2013. Video-CRM: understanding customer behaviors in stores. In Video Surveillance and Transportation Imaging Applications, Vol. 8663. SPIE, 266--271."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475310"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings, Part VII 16","author":"Hou Yunzhong","year":"2020","unstructured":"Yunzhong Hou, Liang Zheng, and Stephen Gould. 2020. Multiview detection with feature perspective transformation. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part VII 16. Springer, 1--18."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4612-4380-9_35"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00781"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Justin Johnson Nikhila Ravi Jeremy Reizenstein David Novotny Shubham Tulsiani Christoph Lassner and Steve Branson. 2020. Accelerating 3d deep learning with pytorch3d. In SIGGRAPH Asia 2020 Courses. 1--1.","DOI":"10.1145\/3415263.3419160"},{"key":"e_1_3_2_1_16_1","volume-title":"Framework for performance evaluation of face, text, and vehicle detection and tracking in video: Data, metrics, and protocol","author":"Kasturi Rangachar","year":"2008","unstructured":"Rangachar Kasturi, Dmitry Goldgof, Padmanabhan Soundararajan, Vasant Manohar, John Garofolo, Rachel Bowers, Matthew Boonstra, Valentina Korzhova, and Jing Zhang. 2008. Framework for performance evaluation of face, text, and vehicle detection and tracking in video: Data, metrics, and protocol. IEEE transactions on pattern analysis and machine intelligence, Vol. 31, 2 (2008), 319--336."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548352"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00283"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00644"},{"key":"e_1_3_2_1_22_1","unstructured":"Shilong Liu Zhaoyang Zeng Tianhe Ren Feng Li Hao Zhang Jie Yang Chunyuan Li Jianwei Yang Hang Su Jun Zhu et al. 2023. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)."},{"key":"e_1_3_2_1_23_1","first-page":"17730","article-title":"Unsupervised multi-view object segmentation using radiance field propagation","volume":"35","author":"Liu Xinhang","year":"2022","unstructured":"Xinhang Liu, Jiaben Chen, Huai Yu, Yu-Wing Tai, and Chi-Keung Tang. 2022. Unsupervised multi-view object segmentation using radiance field propagation. Advances in Neural Information Processing Systems, Vol. 35 (2022), 17730--17743.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings, Part I 16","author":"Mildenhall Ben","year":"2020","unstructured":"Ben Mildenhall, Pratul P Srinivasan, Matthew Tancik, Jonathan T Barron, Ravi Ramamoorthi, and Ren Ng. 2020. NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part I 16. Springer, 405--421."},{"key":"e_1_3_2_1_25_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et al. 2023. DINOv2: Learning Robust Visual Features without Supervision. arXiv preprint arXiv:2304.07193 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings, Part X. Springer, 695--710","author":"Qiu Rui","year":"2022","unstructured":"Rui Qiu, Ming Xu, Yuyao Yan, Jeremy S Smith, and Xi Yang. 2022. 3D Random Occlusion and Multi-layer Projection for Deep Multi-camera Pedestrian Localization. In Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part X. Springer, 695--710."},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_29_1","unstructured":"Tianhe Ren Shilong Liu Ailing Zeng Jing Lin Kunchang Li He Cao Jiayu Chen Xinyu Huang Yukang Chen Feng Yan et al. 2024. Grounded sam: Assembling open-world models for diverse visual tasks. arXiv preprint arXiv:2401.14159 (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00133"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00599"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00644"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW58289.2023.00016"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00305"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00349"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.461"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00849"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28553"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00062"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings, Part XXVIII. Springer, 696--712","author":"Zhou Chong","year":"2022","unstructured":"Chong Zhou, Chen Change Loy, and Bo Dai. 2022. Extract free dense labels from clip. In Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXVIII. Springer, 696--712."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681560","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681560","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:48Z","timestamp":1750295868000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681560"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":40,"alternative-id":["10.1145\/3664647.3681560","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681560","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}