{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:16:10Z","timestamp":1750220170662,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,6,12]],"date-time":"2022-06-12T00:00:00Z","timestamp":1654992000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100019690","name":"Toyota Collaborative Safety Research Center","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100019690","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2145565"],"award-info":[{"award-number":["2145565"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,6,12]]},"DOI":"10.1145\/3546930.3547499","type":"proceedings-article","created":{"date-parts":[[2022,8,17]],"date-time":"2022-08-17T23:14:49Z","timestamp":1660778089000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Flexible and scalable annotation tool to develop scene understanding datasets"],"prefix":"10.1145","author":[{"given":"Md Fazle","family":"Elahi","sequence":"first","affiliation":[{"name":"Indiana University Purdue University"}]},{"given":"Renran","family":"Tian","sequence":"additional","affiliation":[{"name":"Indiana University Purdue University"}]},{"given":"Xiao","family":"Luo","sequence":"additional","affiliation":[{"name":"Indiana University Purdue University"}]}],"member":"320","published-online":{"date-parts":[[2022,8,17]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2014.06.015"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/AVSS.2016.7738055"},{"key":"e_1_3_2_1_4_1","volume-title":"YOLOv4: Optimal Speed and Accuracy of Object Detection. arXiv:2004.10934 [cs, eess] (April","author":"Bochkovskiy Alexey","year":"2020","unstructured":"Alexey Bochkovskiy , Chien-Yao Wang , and Hong-Yuan Mark Liao . 2020. YOLOv4: Optimal Speed and Accuracy of Object Detection. arXiv:2004.10934 [cs, eess] (April 2020 ). arXiv:2004.10934 [cs, eess] http:\/\/arxiv.org\/abs\/2004.10934 Alexey Bochkovskiy, Chien-Yao Wang, and Hong-Yuan Mark Liao. 2020. YOLOv4: Optimal Speed and Accuracy of Object Detection. arXiv:2004.10934 [cs, eess] (April 2020). arXiv:2004.10934 [cs, eess] http:\/\/arxiv.org\/abs\/2004.10934"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.340"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2701413"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350535"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1422953112"},{"key":"e_1_3_2_1_10_1","unstructured":"Intel. 2022. Computer Vision Annotation Tool (CVAT). https:\/\/github.com\/openvinotoolkit\/cvat  Intel. 2022. Computer Vision Annotation Tool (CVAT). https:\/\/github.com\/openvinotoolkit\/cvat"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"e_1_3_2_1_12_1","volume-title":"DeepStory: Video Story QA by Deep Embedded Memory Networks. arXiv:1707.00836 [cs] (July","author":"Kim Kyung-Min","year":"2017","unstructured":"Kyung-Min Kim , Min-Oh Heo , Seong-Ho Choi , and Byoung-Tak Zhang . 2017. DeepStory: Video Story QA by Deep Embedded Memory Networks. arXiv:1707.00836 [cs] (July 2017 ). arXiv:1707.00836 [cs] http:\/\/arxiv.org\/abs\/1707.00836 Kyung-Min Kim, Min-Oh Heo, Seong-Ho Choi, and Byoung-Tak Zhang. 2017. DeepStory: Video Story QA by Deep Embedded Memory Networks. arXiv:1707.00836 [cs] (July 2017). arXiv:1707.00836 [cs] http:\/\/arxiv.org\/abs\/1707.00836"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.455"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_15_1","volume-title":"Berg","author":"Lei Jie","year":"2019","unstructured":"Jie Lei , Licheng Yu , Mohit Bansal , and Tamara L . Berg . 2019 . TVQA : Localized, Compositional Video Question Answering . arXiv:1809.01696 [cs] (May 2019). arXiv:1809.01696 [cs] http:\/\/arxiv.org\/abs\/1809.01696 Jie Lei, Licheng Yu, Mohit Bansal, and Tamara L. Berg. 2019. TVQA: Localized, Compositional Video Question Answering. arXiv:1809.01696 [cs] (May 2019). arXiv:1809.01696 [cs] http:\/\/arxiv.org\/abs\/1809.01696"},{"key":"e_1_3_2_1_16_1","volume-title":"Spatio-Temporal Grounding for Video Question Answering. arXiv:1904.11574 [cs] (May","author":"Lei Jie","year":"2020","unstructured":"Jie Lei , Licheng Yu , Tamara L. Berg , and Mohit Bansal . 2020. TVQA+ : Spatio-Temporal Grounding for Video Question Answering. arXiv:1904.11574 [cs] (May 2020 ). arXiv:1904.11574 [cs] http:\/\/arxiv.org\/abs\/1904.11574 Jie Lei, Licheng Yu, Tamara L. Berg, and Mohit Bansal. 2020. TVQA+: Spatio-Temporal Grounding for Video Question Answering. arXiv:1904.11574 [cs] (May 2020). arXiv:1904.11574 [cs] http:\/\/arxiv.org\/abs\/1904.11574"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_18_1","unstructured":"Chenxi Liu Junhua Mao Fei Sha and Alan Yuille. 2017. Attention Correctness in Neural Image Captioning. (2017) 7.  Chenxi Liu Junhua Mao Fei Sha and Alan Yuille. 2017. Attention Correctness in Neural Image Captioning. (2017) 7."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1803.09845"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.778"},{"key":"e_1_3_2_1_22_1","volume-title":"Advances in Neural Information Processing Systems","volume":"27","author":"Malinowski Mateusz","year":"2014","unstructured":"Mateusz Malinowski and Mario Fritz . 2014 . A Multi-World Approach to Question Answering about Real-World Scenes Based on Uncertain Input . In Advances in Neural Information Processing Systems , Vol. 27 . Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/ 2014\/hash\/d516b13671a4179d9b7b458a6ebdeb92-Abstract.html Mateusz Malinowski and Mario Fritz. 2014. A Multi-World Approach to Question Answering about Real-World Scenes Based on Uncertain Input. In Advances in Neural Information Processing Systems, Vol. 27. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/2014\/hash\/d516b13671a4179d9b7b458a6ebdeb92-Abstract.html"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"e_1_3_2_1_24_1","volume-title":"Ilchae Jung, and Bohyung Han.","author":"Mun Jonghwan","year":"2017","unstructured":"Jonghwan Mun , Paul Hongsuck Seo , Ilchae Jung, and Bohyung Han. 2017 . MarioQA: Answering Questions by Watching Gameplay Videos . (2017), 9. Jonghwan Mun, Paul Hongsuck Seo, Ilchae Jung, and Bohyung Han. 2017. MarioQA: Answering Questions by Watching Gameplay Videos. (2017), 9."},{"key":"e_1_3_2_1_25_1","volume-title":"Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models. arXiv:1505.04870 [cs] (Sept","author":"Plummer Bryan A.","year":"2016","unstructured":"Bryan A. Plummer , Liwei Wang , Chris M. Cervantes , Juan C. Caicedo , Julia Hockenmaier , and Svetlana Lazebnik . 2016. Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models. arXiv:1505.04870 [cs] (Sept . 2016 ). arXiv:1505.04870 [cs] http:\/\/arxiv.org\/abs\/1505.04870 Bryan A. Plummer, Liwei Wang, Chris M. Cervantes, Juan C. Caicedo, Julia Hockenmaier, and Svetlana Lazebnik. 2016. Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models. arXiv:1505.04870 [cs] (Sept. 2016). arXiv:1505.04870 [cs] http:\/\/arxiv.org\/abs\/1505.04870"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10590-1_7"},{"key":"e_1_3_2_1_27_1","volume-title":"Kaylee Burns, Trevor Darrell, and Kate Saenko.","author":"Rohrbach Anna","year":"2019","unstructured":"Anna Rohrbach , Lisa Anne Hendricks , Kaylee Burns, Trevor Darrell, and Kate Saenko. 2019 . Object Hallucination in Image Captioning . arXiv:1809.02156 [cs] (March 2019). arXiv:1809.02156 [cs] http:\/\/arxiv.org\/abs\/1809.02156 Anna Rohrbach, Lisa Anne Hendricks, Kaylee Burns, Trevor Darrell, and Kate Saenko. 2019. Object Hallucination in Image Captioning. arXiv:1809.02156 [cs] (March 2019). arXiv:1809.02156 [cs] http:\/\/arxiv.org\/abs\/1809.02156"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.501"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/IVS.2014.6856599"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2014.29"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-012-0564-1"},{"key":"e_1_3_2_1_33_1","volume-title":"You Only Learn One Representation: Unified Network for Multiple Tasks. arXiv:2105.04206 [cs] (May","author":"Wang Chien-Yao","year":"2021","unstructured":"Chien-Yao Wang , I.- Hau Yeh , and Hong-Yuan Mark Liao . 2021. You Only Learn One Representation: Unified Network for Multiple Tasks. arXiv:2105.04206 [cs] (May 2021 ). arXiv:2105.04206 [cs] http:\/\/arxiv.org\/abs\/2105.04206 Chien-Yao Wang, I.-Hau Yeh, and Hong-Yuan Mark Liao. 2021. You Only Learn One Representation: Unified Network for Multiple Tasks. arXiv:2105.04206 [cs] (May 2021). arXiv:2105.04206 [cs] http:\/\/arxiv.org\/abs\/2105.04206"},{"key":"e_1_3_2_1_34_1","unstructured":"Yuxin Wu Alexander Kirillov Francisco Massa Wan-Yen Lo and Ross Girshick. 2019. Detectron2. https:\/\/github.com\/facebookresearch\/detectron2  Yuxin Wu Alexander Kirillov Francisco Massa Wan-Yen Lo and Ross Girshick. 2019. Detectron2. https:\/\/github.com\/facebookresearch\/detectron2"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459289"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00688"}],"event":{"name":"SIGMOD\/PODS '22: International Conference on Management of Data","sponsor":["SIGMOD ACM Special Interest Group on Management of Data"],"location":"Philadelphia Pennsylvania","acronym":"SIGMOD\/PODS '22"},"container-title":["Proceedings of the Workshop on Human-In-the-Loop Data Analytics"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3546930.3547499","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3546930.3547499","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3546930.3547499","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:41Z","timestamp":1750186841000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3546930.3547499"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6,12]]},"references-count":36,"alternative-id":["10.1145\/3546930.3547499","10.1145\/3546930"],"URL":"https:\/\/doi.org\/10.1145\/3546930.3547499","relation":{},"subject":[],"published":{"date-parts":[[2022,6,12]]},"assertion":[{"value":"2022-08-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}