{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T15:25:18Z","timestamp":1771514718099,"version":"3.50.1"},"publisher-location":"Cham","reference-count":55,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031198113","type":"print"},{"value":"9783031198120","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19812-0_26","type":"book-chapter","created":{"date-parts":[[2022,10,29]],"date-time":"2022-10-29T14:03:42Z","timestamp":1667052222000},"page":"444-460","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":25,"title":["Towards Hard-Positive Query Mining for\u00a0DETR-Based Human-Object Interaction Detection"],"prefix":"10.1007","author":[{"given":"Xubin","family":"Zhong","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Changxing","family":"Ding","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zijian","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shaoli","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,10,30]]},"reference":[{"key":"26_CR1","unstructured":"Gupta, S., Malik, J.: Visual semantic role labeling. arXiv preprint arXiv:1505.04474 (2015)"},{"key":"26_CR2","doi-asserted-by":"crossref","unstructured":"Chao, Y., Liu, Y., Liu, X., Zeng, H., Deng, J.: Learning to detect human-object interactions. In: WACV (2018)","DOI":"10.1109\/WACV.2018.00048"},{"key":"26_CR3","doi-asserted-by":"crossref","unstructured":"Ji, J., Krishna, R., Fei-Fei, L., Niebles, J.: Action genome: Actions as compositions of spatio-temporal scene graphs. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01025"},{"key":"26_CR4","doi-asserted-by":"crossref","unstructured":"Tamura, M., Ohashi, H., Yoshinaga, T.: QPIC: query-based pairwise human-object interaction detection with image-wide contextual information. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01027"},{"key":"26_CR5","doi-asserted-by":"crossref","unstructured":"Kim, B., Lee, J., Kang, J., Kim, E., Kim, H.: HOTR: end-to-end human-object interaction detection with transformers. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00014"},{"key":"26_CR6","doi-asserted-by":"crossref","unstructured":"Zou, C., et al.: End-to-end human object interaction detection with hoi transformer. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01165"},{"key":"26_CR7","unstructured":"Zhang, A., et al.: Mining the Benefits of Two-stage and One-stage HOI Detection. In: NeurIPS (2021)"},{"key":"26_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"26_CR9","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: NeurIPS (2015)"},{"key":"26_CR10","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Transferable Interactiveness Knowledge for Human-Object Interaction Detection. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00370"},{"key":"26_CR11","doi-asserted-by":"crossref","unstructured":"Gupta, T., Schwing, A., Hoiem, D.: No-frills human-object interaction detection: factorization, layout encodings, and training techniques. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00977"},{"key":"26_CR12","doi-asserted-by":"crossref","unstructured":"Wang, T., Yang, T., Danelljan, M., Khan, F., Zhang, X., Sun, J.: Learning human-object interaction detection using interaction points. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00417"},{"key":"26_CR13","doi-asserted-by":"crossref","unstructured":"Liao, Y., Liu, S., Wang, F., Chen, Y., Qian, C., Feng, J.: Ppdm: Parallel point detection and matching for real-time human-object interaction detection. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00056"},{"key":"26_CR14","doi-asserted-by":"crossref","unstructured":"Ulutan, O., Iftekhar, A., Manjunath, B.: VSGNet: Spatial attention network for detecting human object interactions using graph convolutions. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01363"},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"Li, Y.: Detailed 2D\u20133D joint representation for human-object interaction. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01018"},{"key":"26_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-030-58565-5_5","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Zhong","year":"2020","unstructured":"Zhong, X., Ding, C., Qu, X., Tao, D.: Polysemy deciphering network for human-object interaction detection. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12365, pp. 69\u201385. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_5"},{"key":"26_CR17","doi-asserted-by":"crossref","unstructured":"Zhong, X., Ding, C., Qu, X., Tao, D.: Polysemy deciphering network for robust human-object interaction detection. In: IJCV (2021)","DOI":"10.1007\/s11263-021-01458-8"},{"key":"26_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"696","DOI":"10.1007\/978-3-030-58610-2_41","volume-title":"Computer Vision \u2013 ECCV 2020","author":"C Gao","year":"2020","unstructured":"Gao, C., Xu, J., Zou, Y., Huang, J.-B.: DRG: Dual relation graph for human-object interaction detection. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12357, pp. 696\u2013712. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58610-2_41"},{"key":"26_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"584","DOI":"10.1007\/978-3-030-58555-6_35","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Hou","year":"2020","unstructured":"Hou, Z., Peng, X., Qiao, Yu., Tao, D.: Visual compositional learning for human-object interaction detection. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12360, pp. 584\u2013600. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58555-6_35"},{"key":"26_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"718","DOI":"10.1007\/978-3-030-58589-1_43","volume-title":"Computer Vision \u2013 ECCV 2020","author":"D-J Kim","year":"2020","unstructured":"Kim, D.-J., Sun, X., Choi, J., Lin, S., Kweon, I.S.: Detecting human-object interactions with action co-occurrence priors. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12366, pp. 718\u2013736. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58589-1_43"},{"key":"26_CR21","doi-asserted-by":"crossref","unstructured":"Zhou, P., Chi, M.: Relation parsing neural network for human-object interaction detection. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00093"},{"key":"26_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"248","DOI":"10.1007\/978-3-030-58568-6_15","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Liu","year":"2020","unstructured":"Liu, Y., Chen, Q., Zisserman, A.: Amplifying key cues for human-object-interaction detection. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12359, pp. 248\u2013265. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58568-6_15"},{"key":"26_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Y., Yuan, J., Chen, C.: ConsNet: learning consistency graph for zero-shot human-object interaction detection. In: ACM MM (2020)","DOI":"10.1145\/3394171.3413600"},{"key":"26_CR24","doi-asserted-by":"crossref","unstructured":"Wan, B., Zhou, D., Liu, Y., Li, R., He, X.: Pose-aware Multi-level Feature Network for Human Object Interaction Detection. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00956"},{"key":"26_CR25","unstructured":"Gao, C., Zou, Y., Huang, J.: ican: Instance-centric attention network for human-object interaction detection. In: BMVC (2018)"},{"key":"26_CR26","doi-asserted-by":"crossref","unstructured":"Wang, T., et al.: Deep contextual attention for human-object interaction detection. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00579"},{"key":"26_CR27","doi-asserted-by":"crossref","unstructured":"Gkioxari, G., Girshick, R.: Detecting and recognizing human-object interactions. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00872"},{"key":"26_CR28","doi-asserted-by":"crossref","unstructured":"Zhong, X., Qu, X., Ding, C., Tao, D.: Glance and gaze: inferring action-aware points for one-stage human-object interaction detection. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01303"},{"key":"26_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"498","DOI":"10.1007\/978-3-030-58555-6_30","volume-title":"Computer Vision \u2013 ECCV 2020","author":"B Kim","year":"2020","unstructured":"Kim, B., Choi, T., Kang, J., Kim, H.J.: Uniondet: Union-level detector towards real-time human-object interaction detection. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12360, pp. 498\u2013514. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58555-6_30"},{"key":"26_CR30","doi-asserted-by":"crossref","unstructured":"Chen, M., Liao, Y., Liu, S., Chen, Z., Wang, F., Qian, C.: Reformulating hoi detection as adaptive set prediction. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00889"},{"key":"26_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"26_CR32","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"26_CR33","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.: Attention is all you need. In: NeurIPS (2017)"},{"key":"26_CR34","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. J. Mach. Learn. Res. 15, 1929\u20131958 (2014)","journal-title":"J. Mach. Learn. Res."},{"key":"26_CR35","unstructured":"Kuhn, H.: The Hungarian method for the assignment problem. In: Naval Research Logistics Quarterly (2020)"},{"key":"26_CR36","unstructured":"Ghiasi, G., Lin, T., Le, Q.: Dropblock: A regularization method for convolutional networks. In: Wiley Online Library (1955)"},{"key":"26_CR37","doi-asserted-by":"crossref","unstructured":"Zhou, T., Wang, W., Qi, S., Ling, H., Shen, J.: Cascaded human-object interaction recognition. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00432"},{"key":"26_CR38","unstructured":"Pic leaderboard (2019). http:\/\/www.picdataset.com\/challenge\/leaderboard\/hoi2019,"},{"key":"26_CR39","doi-asserted-by":"crossref","unstructured":"Meng, D.: Conditional DETR for fast training convergence. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"26_CR40","doi-asserted-by":"crossref","unstructured":"Gao, P., Zheng, M., Wang, X., Dai, J., Li, H.: Fast convergence of DETR with spatially modulated CoAttention. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00360"},{"key":"26_CR41","doi-asserted-by":"crossref","unstructured":"Dai, X., Chen, Y., Yang, J., Zhang, P., Yuan, L., Zhang, L.: Dynamic DETR: end-to-end object detection with dynamic attention. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00298"},{"key":"26_CR42","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end- to-end object detection. In: ICLR (2020)"},{"key":"26_CR43","unstructured":"Liu, S., et al.: DAB-DETR: dynamic anchor boxes are better queries for DETR. In: ICLR (2022)"},{"key":"26_CR44","doi-asserted-by":"crossref","unstructured":"Yuan, H., Wang, M., Ni, D., Xu, L.: Detecting human-object interactions with object-guided cross-modal calibrated semantics. In: AAAI (2022)","DOI":"10.1609\/aaai.v36i3.20229"},{"key":"26_CR45","doi-asserted-by":"crossref","unstructured":"Li, Z., Zou, C., Zhao, Y., Li, B., Zhong, S.: Improving human-object interaction detection via phrase learning and label composition. In: AAAI (2022)","DOI":"10.1609\/aaai.v36i2.20041"},{"key":"26_CR46","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2018)"},{"key":"26_CR47","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: A metric and a loss for bounding box regression. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00075"},{"key":"26_CR48","doi-asserted-by":"crossref","unstructured":"Lin, T., Goyal, P., Girshick, R., He, K., Dollar, P.: Focal loss for dense object detection. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"26_CR49","unstructured":"Wang, X., Shrivastava, A., Gupta, A.: A-fast-rcnn: Hard positive generation via adversary for object detection. arXiv preprint arXiv:2201.12329 (2022)"},{"key":"26_CR50","doi-asserted-by":"crossref","unstructured":"Shrivastava, A., Gupta, A., Girshick, R.: Training region-based object detectors with online hard example mining. In: CVPR (2017)","DOI":"10.1109\/CVPR.2016.89"},{"key":"26_CR51","doi-asserted-by":"crossref","unstructured":"Wang, K., Wang, P., Ding, C., Tao, D.: Batch coherence-driven network for part-aware person re-identification. In: TIP (2021)","DOI":"10.1109\/TIP.2021.3060909"},{"key":"26_CR52","doi-asserted-by":"crossref","unstructured":"Qu, X., Ding, C., Li, X., Zhong, X., Tao, D.: Distillation using oracle queries for transformer-based human-object interaction detection. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01895"},{"key":"26_CR53","doi-asserted-by":"crossref","unstructured":"Lin, X., Ding, C., Zhang, J., Zhan, Y., Tao, D.: RU-Net: regularized unrolling network for scene graph generation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01885"},{"key":"26_CR54","doi-asserted-by":"crossref","unstructured":"Lin, X., Ding, C., Zhan, Y., Li, Z., Tao, D.: HL-Net: Heterophily learning network for scene graph generation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01887"},{"key":"26_CR55","doi-asserted-by":"crossref","unstructured":"Li, F., Zhang, H., Liu, S., Guo, J., Ni, L., Zhang, L.: Dn-detr: Accelerate detr training by introducing query denoising. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01325"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19812-0_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,4,27]],"date-time":"2023-04-27T13:08:01Z","timestamp":1682600881000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19812-0_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198113","9783031198120"],"references-count":55,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19812-0_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"30 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"From the workshops, 367 reviewed full papers have been selected for publication","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}