{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:16:15Z","timestamp":1750220175745,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":6,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3554766","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:42:46Z","timestamp":1665416566000},"page":"7418-7419","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["PIC'22"],"prefix":"10.1145","author":[{"given":"Si","family":"Liu","sequence":"first","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"given":"Luoqi","family":"Liu","sequence":"additional","affiliation":[{"name":"Meitu Inc, Beijing, China"}]},{"given":"Zongheng","family":"Tang","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"given":"Linli","family":"Lin","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"Lisa Anne Hendricks OliverWang Eli Shechtman Josef Sivic Trevor Darrell and Bryan Russell. 2017. Localizing moments in video with natural language. In ICCV.  Lisa Anne Hendricks OliverWang Eli Shechtman Josef Sivic Trevor Darrell and Bryan Russell. 2017. Localizing moments in video with natural language. In ICCV.","key":"e_1_3_2_1_1_1","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_2_1","volume-title":"Tall: Temporal activity localization via language query. In ICCV.","author":"Gao Jiyang","year":"2017","unstructured":"Jiyang Gao , Chen Sun , Zhenheng Yang , and Ram Nevatia . 2017 . Tall: Temporal activity localization via language query. In ICCV. Jiyang Gao, Chen Sun, Zhenheng Yang, and Ram Nevatia. 2017. Tall: Temporal activity localization via language query. In ICCV."},{"doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Kenji Hata Frederic Ren Li Fei-Fei and Juan Carlos Niebles. 2017. Dense-captioning events in videos. In ICCV.  Ranjay Krishna Kenji Hata Frederic Ren Li Fei-Fei and Juan Carlos Niebles. 2017. Dense-captioning events in videos. In ICCV.","key":"e_1_3_2_1_3_1","DOI":"10.1109\/ICCV.2017.83"},{"doi-asserted-by":"crossref","unstructured":"Gunnar A Sigurdsson G\u00fcl Varol Xiaolong Wang Ali Farhadi Ivan Laptev and Abhinav Gupta. 2016. Hollywood in homes: Crowdsourcing data collection for activity understanding. In ECCV.  Gunnar A Sigurdsson G\u00fcl Varol Xiaolong Wang Ali Farhadi Ivan Laptev and Abhinav Gupta. 2016. Hollywood in homes: Crowdsourcing data collection for activity understanding. In ECCV.","key":"e_1_3_2_1_4_1","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"e_1_3_2_1_5_1","volume-title":"Human-centric spatio-temporal video grounding with visual transformers. TCSVT","author":"Tang Zongheng","year":"2021","unstructured":"Zongheng Tang , Yue Liao , Si Liu , Guanbin Li , Xiaojie Jin , Hongxu Jiang , Qian Yu , and Dong Xu. 2021. Human-centric spatio-temporal video grounding with visual transformers. TCSVT ( 2021 ). Zongheng Tang, Yue Liao, Si Liu, Guanbin Li, Xiaojie Jin, Hongxu Jiang, Qian Yu, and Dong Xu. 2021. Human-centric spatio-temporal video grounding with visual transformers. TCSVT (2021)."},{"key":"e_1_3_2_1_6_1","volume-title":"Youmakeup: A large-scale domain-specific multimodal dataset for fine-grained semantic comprehension. In EMNLP.","author":"Wang Weiying","year":"2019","unstructured":"Weiying Wang , Yongcheng Wang , Shizhe Chen , and Qin Jin . 2019 . Youmakeup: A large-scale domain-specific multimodal dataset for fine-grained semantic comprehension. In EMNLP. Weiying Wang, Yongcheng Wang, Shizhe Chen, and Qin Jin. 2019. Youmakeup: A large-scale domain-specific multimodal dataset for fine-grained semantic comprehension. In EMNLP."}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"MM '22","name":"MM '22: The 30th ACM International Conference on Multimedia","location":"Lisboa Portugal"},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3554766","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3554766","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:48Z","timestamp":1750186848000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3554766"}},"subtitle":["4th Person in Context Workshop"],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":6,"alternative-id":["10.1145\/3503161.3554766","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3554766","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}