{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:19:06Z","timestamp":1750220346979,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,8,24]],"date-time":"2021-08-24T00:00:00Z","timestamp":1629763200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,8,24]]},"DOI":"10.1145\/3460426.3463610","type":"proceedings-article","created":{"date-parts":[[2021,9,1]],"date-time":"2021-09-01T22:50:29Z","timestamp":1630536629000},"page":"349-357","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Learning Hierarchical Visual-Semantic Representation with Phrase Alignment"],"prefix":"10.1145","author":[{"given":"Baoming","family":"Yan","sequence":"first","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"given":"Qingheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"given":"Liyu","family":"Chen","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"given":"Lin","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"given":"Leihao","family":"Pei","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"given":"Jiang","family":"Yang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"given":"Enyun","family":"Yu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"given":"Xiaobo","family":"Li","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"given":"Binqiang","family":"Zhao","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2021,9]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In The IEEE Conference on Computer Vision and Pattern Recognition (2017-07--25)","author":"Anderson Peter","year":"2018","unstructured":"Peter Anderson , Xiaodong He , Chris Buehler , Damien Teney , Mark Johnson , Stephen Gould , and Lei Zhang . 2018 . Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In The IEEE Conference on Computer Vision and Pattern Recognition (2017-07--25) . 6077--6086. showeprint[arXiv]http:\/\/arxiv.org\/abs\/1707.07998v3 [cs.CV] http:\/\/arxiv.org\/pdf\/1707.07998v3 Peter Anderson, Xiaodong He, Chris Buehler, Damien Teney, Mark Johnson, Stephen Gould, and Lei Zhang. 2018. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In The IEEE Conference on Computer Vision and Pattern Recognition (2017-07--25). 6077--6086. showeprint[arXiv]http:\/\/arxiv.org\/abs\/1707.07998v3 [cs.CV] http:\/\/arxiv.org\/pdf\/1707.07998v3"},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the 7th Linguistic Annotation Workshop and Interoperability with Discourse. Association for Computational Linguistics","author":"Banarescu Laura","year":"2013","unstructured":"Laura Banarescu , Claire Bonial , Shu Cai , Madalina Georgescu , Kira Griffitt , Ulf Hermjakob , Kevin Knight , Philipp Koehn , Martha Palmer , and Nathan Schneider . 2013 . Abstract Meaning Representation for Sembanking . In Proceedings of the 7th Linguistic Annotation Workshop and Interoperability with Discourse. Association for Computational Linguistics , Sofia, Bulgaria, 178--186. https:\/\/www.aclweb.org\/anthology\/W13--2322 Laura Banarescu, Claire Bonial, Shu Cai, Madalina Georgescu, Kira Griffitt, Ulf Hermjakob, Kevin Knight, Philipp Koehn, Martha Palmer, and Nathan Schneider. 2013. Abstract Meaning Representation for Sembanking. In Proceedings of the 7th Linguistic Annotation Workshop and Interoperability with Discourse. Association for Computational Linguistics, Sofia, Bulgaria, 178--186. https:\/\/www.aclweb.org\/anthology\/W13--2322"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.201"},{"key":"e_1_3_2_1_4_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri , David J. Fleet , Jamie Ryan Kiros, and Sanja Fidler . 2017 . VSE+: Improved Visual-Semantic Embeddings. CoRR , Vol. abs\/ 1707 .05612 (2017). arxiv: 1707.05612 http:\/\/arxiv.org\/abs\/1707.05612 Fartash Faghri, David J. Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. VSE+: Improved Visual-Semantic Embeddings. CoRR, Vol. abs\/1707.05612 (2017). arxiv: 1707.05612 http:\/\/arxiv.org\/abs\/1707.05612"},{"key":"e_1_3_2_1_5_1","volume-title":"Marctextquotesingle Aurelio Ranzato, and Tomas Mikolov","author":"Frome Andrea","year":"2013","unstructured":"Andrea Frome , Greg S Corrado , Jon Shlens , Samy Bengio , Jeff Dean , Marctextquotesingle Aurelio Ranzato, and Tomas Mikolov . 2013 . DeViSE: A Deep Visual-Semantic Embedding Model. In Advances in Neural Information Processing Systems 26, C. J. C. Burges, L. Bottou, M. Welling, Z. Ghahramani, and K. Q. Weinberger (Eds.). Curran Associates, Inc ., 2121--2129. http:\/\/papers.nips.cc\/paper\/5204-devise-a-deep-visual-semantic-embedding-model.pdf Andrea Frome, Greg S Corrado, Jon Shlens, Samy Bengio, Jeff Dean, Marctextquotesingle Aurelio Ranzato, and Tomas Mikolov. 2013. DeViSE: A Deep Visual-Semantic Embedding Model. In Advances in Neural Information Processing Systems 26, C. J. C. Burges, L. Bottou, M. Welling, Z. Ghahramani, and K. Q. Weinberger (Eds.). Curran Associates, Inc., 2121--2129. http:\/\/papers.nips.cc\/paper\/5204-devise-a-deep-visual-semantic-embedding-model.pdf"},{"key":"e_1_3_2_1_6_1","volume-title":"Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation. In The IEEE Conference on Computer Vision and Pattern Recognition.","author":"Girshick Ross","year":"2014","unstructured":"Ross Girshick , Jeff Donahue , Trevor Darrell , and Jitendra Malik . 2014 . Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation. In The IEEE Conference on Computer Vision and Pattern Recognition. Ross Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. 2014. Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation. In The IEEE Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_7_1","volume-title":"Imagine and Match: Improving Textual-Visual Cross-Modal Retrieval With Generative Models. In The IEEE Conference on Computer Vision and Pattern Recognition.","author":"Gu Jiuxiang","year":"2018","unstructured":"Jiuxiang Gu , Jianfei Cai , Shafiq R. Joty , Li Niu , and Gang Wang . 2018 . Look , Imagine and Match: Improving Textual-Visual Cross-Modal Retrieval With Generative Models. In The IEEE Conference on Computer Vision and Pattern Recognition. Jiuxiang Gu, Jianfei Cai, Shafiq R. Joty, Li Niu, and Gang Wang. 2018. Look, Imagine and Match: Improving Textual-Visual Cross-Modal Retrieval With Generative Models. In The IEEE Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00587"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00645"},{"key":"e_1_3_2_1_10_1","volume-title":"Image Generation From Scene Graphs. In The IEEE Conference on Computer Vision and Pattern Recognition.","author":"Johnson Justin","year":"2018","unstructured":"Justin Johnson , Agrim Gupta , and Li Fei-Fei . 2018 . Image Generation From Scene Graphs. In The IEEE Conference on Computer Vision and Pattern Recognition. Justin Johnson, Agrim Gupta, and Li Fei-Fei. 2018. Image Generation From Scene Graphs. In The IEEE Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_11_1","volume-title":"Image Retrieval Using Scene Graphs. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Johnson Justin","year":"2015","unstructured":"Justin Johnson , Ranjay Krishna , Michael Stark , Li-Jia Li , David Shamma , Michael Bernstein , and Li Fei-Fei . 2015 . Image Retrieval Using Scene Graphs. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR). Justin Johnson, Ranjay Krishna, Michael Stark, Li-Jia Li, David Shamma, Michael Bernstein, and Li Fei-Fei. 2015. Image Retrieval Using Scene Graphs. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_12_1","volume-title":"Deep Visual-Semantic Alignments for Generating Image Descriptions. In The IEEE Conference on Computer Vision and Pattern Recognition.","author":"Karpathy Andrej","year":"2015","unstructured":"Andrej Karpathy and Li Fei-Fei . 2015 . Deep Visual-Semantic Alignments for Generating Image Descriptions. In The IEEE Conference on Computer Vision and Pattern Recognition. Andrej Karpathy and Li Fei-Fei. 2015. Deep Visual-Semantic Alignments for Generating Image Descriptions. In The IEEE Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_13_1","volume-title":"Bottom-up and top-down attention: different processes and overlapping neural systems. The Neuroscientist : a review journal bringing neurobiology, neurology and psychiatry","author":"Katsuki Fumi","year":"2014","unstructured":"Fumi Katsuki and Christos Constantinidis . 2014. Bottom-up and top-down attention: different processes and overlapping neural systems. The Neuroscientist : a review journal bringing neurobiology, neurology and psychiatry , Vol. 20 , 5 ( October 2014 ), 509-521. https:\/\/doi.org\/10.1177\/1073858413514136 10.1177\/1073858413514136 Fumi Katsuki and Christos Constantinidis. 2014. Bottom-up and top-down attention: different processes and overlapping neural systems. The Neuroscientist : a review journal bringing neurobiology, neurology and psychiatry, Vol. 20, 5 (October 2014), 509-521. https:\/\/doi.org\/10.1177\/1073858413514136"},{"key":"e_1_3_2_1_14_1","volume-title":"Zemel","author":"Kiros Ryan","year":"2014","unstructured":"Ryan Kiros , Ruslan Salakhutdinov , and Richard S . Zemel . 2014 . Unifying Visual-Semantic Embeddings with Multimodal Neural Language Models. CoRR , Vol. abs\/ 1411 .2539 (2014). arxiv: 1411.2539 http:\/\/arxiv.org\/abs\/1411.2539 Ryan Kiros, Ruslan Salakhutdinov, and Richard S. Zemel. 2014. Unifying Visual-Semantic Embeddings with Multimodal Neural Language Models. CoRR, Vol. abs\/1411.2539 (2014). arxiv: 1411.2539 http:\/\/arxiv.org\/abs\/1411.2539"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350869"},{"key":"e_1_3_2_1_19_1","volume-title":"Visual Relationship Detection with Language Priors. CoRR","author":"Lu Cewu","year":"2016","unstructured":"Cewu Lu , Ranjay Krishna , Michael S. Bernstein , and Fei-Fei Li. 2016. Visual Relationship Detection with Language Priors. CoRR , Vol. abs\/ 1608 .00187 ( 2016 ). arxiv: 1608.00187 http:\/\/arxiv.org\/abs\/1608.00187 Cewu Lu, Ranjay Krishna, Michael S. Bernstein, and Fei-Fei Li. 2016. Visual Relationship Detection with Language Priors. CoRR, Vol. abs\/1608.00187 (2016). arxiv: 1608.00187 http:\/\/arxiv.org\/abs\/1608.00187"},{"volume-title":"Advances in Neural Information Processing Systems 26","author":"Mikolov Tomas","key":"e_1_3_2_1_20_1","unstructured":"Tomas Mikolov , Ilya Sutskever , Kai Chen , Greg S Corrado , and Jeff Dean . 2013. Distributed Representations of Words and Phrases and their Compositionality . In Advances in Neural Information Processing Systems 26 , C. J. C. Burges, L. Bottou, M. Welling, Z. Ghahramani, and K. Q. Weinberger (Eds.). Curran Associates, Inc. , 3111--3119. http:\/\/papers.nips.cc\/paper\/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg S Corrado, and Jeff Dean. 2013. Distributed Representations of Words and Phrases and their Compositionality. In Advances in Neural Information Processing Systems 26, C. J. C. Burges, L. Bottou, M. Welling, Z. Ghahramani, and K. Q. Weinberger (Eds.). Curran Associates, Inc., 3111--3119. http:\/\/papers.nips.cc\/paper\/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.232"},{"key":"e_1_3_2_1_22_1","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. In Advances in neural information processing systems. 91--99.  Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. In Advances in neural information processing systems. 91--99."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00208"},{"key":"e_1_3_2_1_24_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information Processing Systems. 5998--6008.  Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information Processing Systems. 5998--6008."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350875"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/526"},{"key":"e_1_3_2_1_27_1","volume-title":"Unified Visual-Semantic Embeddings: Bridging Vision and Language With Structured Meaning Representations. In The IEEE Conference on Computer Vision and Pattern Recognition.","author":"Wu Hao","year":"2019","unstructured":"Hao Wu , Jiayuan Mao , Yufeng Zhang , Yuning Jiang , Lei Li , Weiwei Sun , and Wei-Ying Ma . 2019 a . Unified Visual-Semantic Embeddings: Bridging Vision and Language With Structured Meaning Representations. In The IEEE Conference on Computer Vision and Pattern Recognition. Hao Wu, Jiayuan Mao, Yufeng Zhang, Yuning Jiang, Lei Li, Weiwei Sun, and Wei-Ying Ma. 2019 a. Unified Visual-Semantic Embeddings: Bridging Vision and Language With Structured Meaning Representations. In The IEEE Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350940"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_30_1","volume-title":"Zettlemoyer and Michael Collins","author":"Luke","year":"2012","unstructured":"Luke S. Zettlemoyer and Michael Collins . 2012 . Learning to Map Sentences to Logical Form : Structured Classification with Probabilistic Categorial Grammars. CoRR , Vol. abs\/ 1207 .1420 (2012). arxiv: 1207.1420 http:\/\/arxiv.org\/abs\/1207.1420 Luke S. Zettlemoyer and Michael Collins. 2012. Learning to Map Sentences to Logical Form: Structured Classification with Probabilistic Categorial Grammars. CoRR, Vol. abs\/1207.1420 (2012). arxiv: 1207.1420 http:\/\/arxiv.org\/abs\/1207.1420"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00359"},{"key":"e_1_3_2_1_32_1","volume-title":"Dual-path convolutional image-text embedding with instance loss. arXiv preprint arXiv:1711.05535","author":"Zheng Zhedong","year":"2017","unstructured":"Zhedong Zheng , Liang Zheng , Michael Garrett , Yi Yang , and Yi-Dong Shen . 2017. Dual-path convolutional image-text embedding with instance loss. arXiv preprint arXiv:1711.05535 ( 2017 ). Zhedong Zheng, Liang Zheng, Michael Garrett, Yi Yang, and Yi-Dong Shen. 2017. Dual-path convolutional image-text embedding with instance loss. arXiv preprint arXiv:1711.05535 (2017)."}],"event":{"name":"ICMR '21: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Taipei Taiwan","acronym":"ICMR '21"},"container-title":["Proceedings of the 2021 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3460426.3463610","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3460426.3463610","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:17:03Z","timestamp":1750191423000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3460426.3463610"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,8,24]]},"references-count":32,"alternative-id":["10.1145\/3460426.3463610","10.1145\/3460426"],"URL":"https:\/\/doi.org\/10.1145\/3460426.3463610","relation":{},"subject":[],"published":{"date-parts":[[2021,8,24]]},"assertion":[{"value":"2021-09-01","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}