{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,24]],"date-time":"2025-07-24T12:29:26Z","timestamp":1753360166593,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,10,15]],"date-time":"2019-10-15T00:00:00Z","timestamp":1571097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Open Research Fund of Beijing Key Laboratory of Big Data Technology for Food Safety Beijing Technology and Business University","award":["BTBD-2018KF05"],"award-info":[{"award-number":["BTBD-2018KF05"]}]},{"name":"the Science and Technology Major Project of Guangxi","award":["GuikeAA18118054"],"award-info":[{"award-number":["GuikeAA18118054"]}]},{"name":"the National Natural Science Foundation of China","award":["No. 61877006 No. 61532006 No. 61772083 No. 61802028"],"award-info":[{"award-number":["No. 61877006 No. 61532006 No. 61772083 No. 61802028"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,10,15]]},"DOI":"10.1145\/3343031.3350892","type":"proceedings-article","created":{"date-parts":[[2019,10,21]],"date-time":"2019-10-21T16:32:26Z","timestamp":1571675546000},"page":"1313-1321","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Fine-grained Cross-media Representation Learning with Deep Quantization Attention Network"],"prefix":"10.1145","author":[{"given":"Meiyu","family":"Liang","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"given":"Junping","family":"Du","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"given":"Wu","family":"Liu","sequence":"additional","affiliation":[{"name":"AI Research of JD.com, Beijing, China"}]},{"given":"Zhe","family":"Xue","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"given":"Yue","family":"Geng","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"given":"Congxian","family":"Yang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2019,10,15]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Galen Andrew Raman Arora Jeff Bilmes and Karen Livescu. 2013. Deep Canonical Correlation Analysis. In ICML. 1247--1255.  Galen Andrew Raman Arora Jeff Bilmes and Karen Livescu. 2013. Deep Canonical Correlation Analysis. In ICML. 1247--1255."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"M. M. Bronstein A. M. Bronstein F. Michel and N. Paragios. 2010. Data fusion through cross-modality metric learning using similarity-sensitive hashing. In CVPR . 3594--3601.  M. M. Bronstein A. M. Bronstein F. Michel and N. Paragios. 2010. Data fusion through cross-modality metric learning using similarity-sensitive hashing. In CVPR . 3594--3601.","DOI":"10.1109\/CVPR.2010.5539928"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Yue Cao Mingsheng Long and Jianmin Wang. 2017a. Correlation Hashing Network for Efficient Cross-Modal Retrieval. In BMVL . 1--12.  Yue Cao Mingsheng Long and Jianmin Wang. 2017a. Correlation Hashing Network for Efficient Cross-Modal Retrieval. In BMVL . 1--12.","DOI":"10.5244\/C.31.128"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Yue Cao Mingsheng Long Jianmin Wang and Shichen Liu. 2017b. Collective Deep Quantization for Efficient Cross-Modal Retrieval. In AAAI . 3974--3980.  Yue Cao Mingsheng Long Jianmin Wang and Shichen Liu. 2017b. Collective Deep Quantization for Efficient Cross-Modal Retrieval. In AAAI . 3974--3980.","DOI":"10.1609\/aaai.v31i1.11218"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Y. Cao M. Long J. Wang and S. Liu. 2017. Deep Visual-Semantic Quantization for Efficient Image Retrieval. In CVPR . 916--925.  Y. Cao M. Long J. Wang and S. Liu. 2017. Deep Visual-Semantic Quantization for Efficient Image Retrieval. In CVPR . 916--925.","DOI":"10.1109\/CVPR.2017.104"},{"volume-title":"Yu","year":"2016","author":"Cao Yue","key":"e_1_3_2_1_6_1"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Yue Cao Mingsheng Long Jianmin Wang Han Zhu and Qingfu Wen. 2016b. Deep Quantization Network for Efficient Image Retrieval. In AAAI. 3457--3463.  Yue Cao Mingsheng Long Jianmin Wang Han Zhu and Qingfu Wen. 2016b. Deep Quantization Network for Efficient Image Retrieval. In AAAI. 3457--3463.","DOI":"10.1609\/aaai.v30i1.10455"},{"volume-title":"Yu","year":"2017","author":"Cao Zhangjie","key":"e_1_3_2_1_8_1"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Tat-Seng Chua Jinhui Tang Richang Hong Haojie Li Zhiping Luo and Yantao Zheng. 2009. NUS-WIDE: A Real-world Web Image Database from National University of Singapore. In CIVR . Article 48 bibinfonumpages9 pages.  Tat-Seng Chua Jinhui Tang Richang Hong Haojie Li Zhiping Luo and Yantao Zheng. 2009. NUS-WIDE: A Real-world Web Image Database from National University of Singapore. In CIVR . Article 48 bibinfonumpages9 pages.","DOI":"10.1145\/1646396.1646452"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Fangxiang Feng Xiaojie Wang and Ruifan Li. 2014. Cross-modal Retrieval with Correspondence Autoencoder. In ACM MM. 7--16.  Fangxiang Feng Xiaojie Wang and Ruifan Li. 2014. Cross-modal Retrieval with Correspondence Autoencoder. In ACM MM. 7--16.","DOI":"10.1145\/2647868.2654902"},{"volume-title":"Guibas","year":"2018","author":"Gan Chuang","key":"e_1_3_2_1_11_1"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Chuang Gan Chen Sun Lixin Duan and Boqing Gong. 2016a. Webly-supervised video recognition by mutually voting for relevant web images and web video frames. In ECCV. 849--866.  Chuang Gan Chen Sun Lixin Duan and Boqing Gong. 2016a. Webly-supervised video recognition by mutually voting for relevant web images and web video frames. In ECCV. 849--866.","DOI":"10.1007\/978-3-319-46487-9_52"},{"key":"e_1_3_2_1_13_1","unstructured":"Chuang Gan Tianbao Yang and Boqing Gong. 2016b. Learning attributes equals multi-source domain generalization. In CVPR . 87--97.  Chuang Gan Tianbao Yang and Boqing Gong. 2016b. Learning attributes equals multi-source domain generalization. In CVPR . 87--97."},{"key":"e_1_3_2_1_14_1","unstructured":"Ian Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron Courville and Yoshua Bengio. 2014. Generative Adversarial Nets. In NIPS. 2672--2680.  Ian Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron Courville and Yoshua Bengio. 2014. Generative Adversarial Nets. In NIPS. 2672--2680."},{"key":"#cr-split#-e_1_3_2_1_15_1.1","doi-asserted-by":"crossref","unstructured":"X. He and Y. Peng. 2019. Fine-grained Visual-textual Representation Learning. IEEE T CIRC SYST VID DOI: 10.1109\/TCSVT.2019.2892802 (2019) 1--12. 10.1109\/TCSVT.2019.2892802","DOI":"10.1109\/TCSVT.2019.2892802"},{"key":"#cr-split#-e_1_3_2_1_15_1.2","doi-asserted-by":"crossref","unstructured":"X. He and Y. Peng. 2019. Fine-grained Visual-textual Representation Learning. IEEE T CIRC SYST VID DOI: 10.1109\/TCSVT.2019.2892802 (2019) 1--12.","DOI":"10.1109\/TCSVT.2019.2892802"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2760101"},{"volume-title":"Lew","year":"2008","author":"Huiskes Mark J.","key":"e_1_3_2_1_18_1"},{"key":"e_1_3_2_1_19_1","unstructured":"Y. Yang J. Wang Z. Huang J. Mao W. Xu and A. Yuille. 2015. Deep captioning with multimodal recurrent neural networks (m-RNN). In ICLR .  Y. Yang J. Wang Z. Huang J. Mao W. Xu and A. Yuille. 2015. Deep captioning with multimodal recurrent neural networks (m-RNN). In ICLR ."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Qing-Yuan Jiang and Wu-Jun Li. 2017. Deep Cross-Modal Hashing. In CVPR. 3270--3278.  Qing-Yuan Jiang and Wu-Jun Li. 2017. Deep Cross-Modal Hashing. In CVPR. 3270--3278.","DOI":"10.1109\/CVPR.2017.348"},{"key":"e_1_3_2_1_21_1","unstructured":"Kuang-Huei Lee Xi Chen Gang Hua Houdong Hu and Xiaodong He. 2018. Stacked Cross Attention for Image-Text Matching. In ECCV .  Kuang-Huei Lee Xi Chen Gang Hua Houdong Hu and Xiaodong He. 2018. Stacked Cross Attention for Image-Text Matching. In ECCV ."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Zijia Lin Guiguang Ding Mingqing Hu and Jianmin Wang. 2015. Semantics-Preserving Hashing for Cross-View Retrieval. In CVPR. 3864--3872.  Zijia Lin Guiguang Ding Mingqing Hu and Jianmin Wang. 2015. Semantics-Preserving Hashing for Cross-View Retrieval. In CVPR. 3864--3872.","DOI":"10.1109\/CVPR.2015.7299011"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Bin Liu Yue Cao Mingsheng Long Jianmin Wang and Jingdong Wang. 2018. Deep Triplet Quantization. In ACM MM .  Bin Liu Yue Cao Mingsheng Long Jianmin Wang and Jingdong Wang. 2018. Deep Triplet Quantization. In ACM MM .","DOI":"10.1145\/3240508.3240516"},{"volume-title":"A New Evaluation Protocol and Benchmarking Results for Extendable Cross-media Retrieval. CoRR","year":"2017","author":"Liu Ruoyu","key":"e_1_3_2_1_24_1"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"T. Liu Y. Zhao S. Wei Y. Wei and L. Liao. 2017. Enhanced isomorphic semantic representation for cross-media retrieval. In ICME . 967--972.  T. Liu Y. Zhao S. Wei Y. Wei and L. Liao. 2017. Enhanced isomorphic semantic representation for cross-media retrieval. In ICME . 967--972.","DOI":"10.1109\/ICME.2017.8019356"},{"key":"e_1_3_2_1_26_1","article-title":"Deep learning hashing for mobile visual search","volume":"1","author":"Liu W.","year":"2017","journal-title":"EURASIP Journal on Image and Video Processing"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"W. Liu T. Mei Y. Zhang J. Li and S. Li. 2013. Listen look and gotcha: instant video search with mobile phones by layered audio-video indexings. In ACM MM. 887--896.  W. Liu T. Mei Y. Zhang J. Li and S. Li. 2013. Listen look and gotcha: instant video search with mobile phones by layered audio-video indexings. In ACM MM. 887--896.","DOI":"10.1145\/2502081.2502084"},{"volume-title":"Yu","year":"2016","author":"Long Mingsheng","key":"e_1_3_2_1_28_1"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2017.265091429"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.225"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-015-2952-3"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1049\/cje.2015.07.016"},{"key":"e_1_3_2_1_33_1","unstructured":"Tomas Mikolov Ilya Sutskever Kai Chen Greg S Corrado and Jeff Dean. 2013. Distributed Representations of Words and Phrases and their Compositionality. In NIPS. 3111--3119.  Tomas Mikolov Ilya Sutskever Kai Chen Greg S Corrado and Jeff Dean. 2013. Distributed Representations of Words and Phrases and their Compositionality. In NIPS. 3111--3119."},{"key":"e_1_3_2_1_34_1","first-page":"2372","article-title":"An Overview of Cross-Media Retrieval: Concepts, Methodologies, Benchmarks, and Challenges","volume":"28","author":"Peng Y.","year":"2018","journal-title":"IEEE TCSVT"},{"key":"e_1_3_2_1_35_1","volume-title":"ACM TOMM","volume":"15","author":"Peng Yuxin","year":"2017"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2852503"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Bryan Perozzi Rami Al-Rfou and Steven Skiena. 2014. DeepWalk: Online Learning of Social Representations. In ACM SIGKDD . 701--710.  Bryan Perozzi Rami Al-Rfou and Steven Skiena. 2014. DeepWalk: Online Learning of Social Representations. In ACM SIGKDD . 701--710.","DOI":"10.1145\/2623330.2623732"},{"key":"e_1_3_2_1_38_1","unstructured":"Jinwei Qi Yuxin Peng and Yuxin Yuan. 2018. Cross-media Multi-level Alignment with Relation Attention Network. In IJCAI . 892--898.  Jinwei Qi Yuxin Peng and Yuxin Yuan. 2018. Cross-media Multi-level Alignment with Relation Attention Network. In IJCAI . 892--898."},{"key":"e_1_3_2_1_39_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. In ICLR . 1--14.  Karen Simonyan and Andrew Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. In ICLR . 1--14."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1162\/089976600300015349"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Bokun Wang Yang Yang Xing Xu Alan Hanjalic and Heng Tao Shen. 2017. Adversarial Cross-Modal Retrieval. In ACM MM. 154--162.  Bokun Wang Yang Yang Xing Xu Alan Hanjalic and Heng Tao Shen. 2017. Adversarial Cross-Modal Retrieval. In ACM MM. 154--162.","DOI":"10.1145\/3123266.3123326"},{"key":"#cr-split#-e_1_3_2_1_42_1.1","doi-asserted-by":"crossref","unstructured":"D. Wang X.. Gao X. Wang and L. He. 2018. Label Consistent Matrix Factorization Hashing for Large-Scale Cross-Modal Similarity Search. TPAMI (2018) DOI: 10.1109\/TPAMI.2018.2861000. 10.1109\/TPAMI.2018.2861000","DOI":"10.1109\/TPAMI.2018.2861000"},{"key":"#cr-split#-e_1_3_2_1_42_1.2","doi-asserted-by":"crossref","unstructured":"D. Wang X.. Gao X. Wang and L. He. 2018. Label Consistent Matrix Factorization Hashing for Large-Scale Cross-Modal Similarity Search. TPAMI (2018) DOI: 10.1109\/TPAMI.2018.2861000.","DOI":"10.1109\/TPAMI.2018.2861000"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Shuhui Wang Yangyu Chen Junbao Zhuo Qingming Huang and Qi Tian. 2018. Joint Global and Co-Attentive Representation Learning for Image-Sentence Retrieval. In ACM MM. 1398--1406.  Shuhui Wang Yangyu Chen Junbao Zhuo Qingming Huang and Qi Tian. 2018. Joint Global and Co-Attentive Representation Learning for Image-Sentence Retrieval. In ACM MM. 1398--1406.","DOI":"10.1145\/3240508.3240535"},{"key":"e_1_3_2_1_44_1","first-page":"1083","article-title":"On Deep Multi-View Representation Learning","volume":"37","author":"Wang Weiran","year":"2015","journal-title":"ICML"},{"key":"e_1_3_2_1_45_1","first-page":"449","article-title":"Cross-Modal Retrieval With CNN Visual Features: A New Baseline","volume":"47","author":"Wei Y.","year":"2017","journal-title":"IEEE T CYBERNETICS"},{"key":"e_1_3_2_1_46_1","first-page":"327","article-title":"Bilevel Multiview Latent Space Learning","volume":"28","author":"Xue Z.","year":"2018","journal-title":"IEEE TCSVT"},{"key":"e_1_3_2_1_47_1","unstructured":"Zhaoda Ye and Yuxin Peng. 2018. Multi-Scale Correlation for Sequential Cross-modal Hashing Learning. In ACM MM . 852--860.  Zhaoda Ye and Yuxin Peng. 2018. Multi-Scale Correlation for Sequential Cross-modal Hashing Learning. In ACM MM . 852--860."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2016.2609463"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/2978656"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Hanwang Zhang Xindi Shang Huanbo Luan Yang Yang and Tat-Seng Chua. 2015. Learning Features from Large-Scale Noisy and Social Image-Tag Collection. In ACM MM . 1079--1082.  Hanwang Zhang Xindi Shang Huanbo Luan Yang Yang and Tat-Seng Chua. 2015. Learning Features from Large-Scale Noisy and Social Image-Tag Collection. In ACM MM . 1079--1082.","DOI":"10.1145\/2733373.2806286"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Jian Zhang Yuxin Peng and Mingkuan Yuan. 2018a. Unsupervised Generative Adversarial Cross-modal Hashing. In AAAI. 539--546.  Jian Zhang Yuxin Peng and Mingkuan Yuan. 2018a. Unsupervised Generative Adversarial Cross-modal Hashing. In AAAI. 539--546.","DOI":"10.1609\/aaai.v32i1.11263"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2015.2507127"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Xi Zhang Siyu Zhou Jiashi Feng Hanjiang Lai Bo Li Yan Pan Jian Yin and Shuicheng Yan. 2018b. HashGAN: Attention-aware Deep Adversarial Hashing for Cross Modal Retrieval. In ECCV .  Xi Zhang Siyu Zhou Jiashi Feng Hanjiang Lai Bo Li Yan Pan Jian Yin and Shuicheng Yan. 2018b. HashGAN: Attention-aware Deep Adversarial Hashing for Cross Modal Retrieval. In ECCV .","DOI":"10.1007\/978-3-030-01267-0_36"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Jile Zhou Guiguang Ding and Yuchen Guo. 2014. Latent Semantic Sparse Hashing for Cross-modal Similarity Search. In ACM SIGIR . 415--424.  Jile Zhou Guiguang Ding and Yuchen Guo. 2014. Latent Semantic Sparse Hashing for Cross-modal Similarity Search. In ACM SIGIR . 415--424.","DOI":"10.1145\/2600428.2609610"}],"event":{"name":"MM '19: The 27th ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Nice France","acronym":"MM '19"},"container-title":["Proceedings of the 27th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3350892","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3343031.3350892","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:13:17Z","timestamp":1750201997000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3350892"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,10,15]]},"references-count":56,"alternative-id":["10.1145\/3343031.3350892","10.1145\/3343031"],"URL":"https:\/\/doi.org\/10.1145\/3343031.3350892","relation":{},"subject":[],"published":{"date-parts":[[2019,10,15]]},"assertion":[{"value":"2019-10-15","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}