{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T18:28:29Z","timestamp":1771698509044,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the National Natural Science Foundation of China","award":["61972022;1936212"],"award-info":[{"award-number":["61972022;1936212"]}]},{"name":"the National Key Research and Development of China","award":["2017YFC1703503"],"award-info":[{"award-number":["2017YFC1703503"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475659","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T06:35:51Z","timestamp":1634538951000},"page":"5353-5362","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":31,"title":["Heterogeneous Feature Fusion and Cross-modal Alignment for Composed Image Retrieval"],"prefix":"10.1145","author":[{"given":"Gangjian","family":"Zhang","sequence":"first","affiliation":[{"name":"Beijing Jiaotong University &amp; Beijing Key Laboratory of Advanced Information Science and Network Technology, Beijing, China"}]},{"given":"Shikui","family":"Wei","sequence":"additional","affiliation":[{"name":"Beijing Jiaotong University &amp; Beijing Key Laboratory of Advanced Information Science and Network Technology, Beijing, China"}]},{"given":"Huaxin","family":"Pang","sequence":"additional","affiliation":[{"name":"Beijing Jiaotong University &amp; Beijing Key Laboratory of Advanced Information Science and Network Technology, Beijing, China"}]},{"given":"Yao","family":"Zhao","sequence":"additional","affiliation":[{"name":"Beijing Jiaotong University &amp; Beijing Key Laboratory of Advanced Information Science and Network Technology, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00804"},{"key":"e_1_3_2_2_2_1","volume-title":"Bottom-Up and Top-Down Attention for Image Captioning and VQA. CoRR","author":"Anderson Peter","year":"2017"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_2_4_1","volume-title":"Manuel Montes y G\u00f3mez, and Fabio A. Gonz\u00e1lez","author":"Arevalo John","year":"2017"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.5555\/1886063.1886114"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00307"},{"key":"e_1_3_2_2_7_1","volume-title":"Referring Expression Object Segmentation with Caption-Aware Consistency. CoRR","author":"Chen Yi-Wen","year":"2019"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"crossref","unstructured":"Yen-Chun Chen Linjie Li Licheng Yu Ahmed El Kholy Faisal Ahmed Zhe Gan Yu Cheng and Jingjing Liu. 2020 b. UNITER: Learning UN iversal Image-TE xt Representations. https:\/\/openreview.net\/forum?id=S1eL4kBYwr  Yen-Chun Chen Linjie Li Licheng Yu Ahmed El Kholy Faisal Ahmed Zhe Gan Yu Cheng and Jingjing Liu. 2020 b. UNITER: Learning UN iversal Image-TE xt Representations. https:\/\/openreview.net\/forum?id=S1eL4kBYwr","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_2_9_1","volume-title":"Modality-Agnostic Attention Fusion for visual search with text feedback. arXiv preprint arXiv:2007.00145","author":"Dodds Eric","year":"2020"},{"key":"e_1_3_2_2_10_1","volume-title":"Belongie","author":"Forbes Maxwell","year":"2019"},{"key":"e_1_3_2_2_11_1","volume-title":"Huang","author":"Fu Yang","year":"2018"},{"key":"e_1_3_2_2_12_1","volume-title":"Daylen Yang, Anna Rohrbach, Trevor Darrell, and Marcus Rohrbach.","author":"Fukui Akira","year":"2016"},{"key":"e_1_3_2_2_13_1","volume-title":"Dynamic Fusion with Intra- and Inter- Modality Attention Flow for Visual Question Answering. CoRR","author":"Gao Peng","year":"2018"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46466-4_15"},{"key":"e_1_3_2_2_15_1","volume-title":"The Fashion IQ Dataset: Retrieving Images by Combining Side Information and Relative Natural Language Feedback. CoRR","author":"Guo Xiaoxiao","year":"2019"},{"key":"e_1_3_2_2_16_1","volume-title":"Davis","author":"Han Xintong","year":"2017"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_38"},{"key":"e_1_3_2_2_19_1","unstructured":"Alexander Hermans Lucas Beyer and Bastian Leibe. 2017. In Defense of the Triplet Loss for Person Re-Identification. CoRR Vol. abs\/1703.07737 (2017). arxiv: 1703.07737 http:\/\/arxiv.org\/abs\/1703.07737  Alexander Hermans Lucas Beyer and Bastian Leibe. 2017. In Defense of the Triplet Loss for Person Re-Identification. CoRR Vol. abs\/1703.07737 (2017). arxiv: 1703.07737 http:\/\/arxiv.org\/abs\/1703.07737"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00365"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00448"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298744"},{"key":"e_1_3_2_2_24_1","volume-title":"Learning to Describe Differences Between Pairs of Similar Images. CoRR","author":"Jhamtani Harsh","year":"2018"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.5555\/3157096.3157137"},{"key":"e_1_3_2_2_26_1","volume-title":"Woosang Lim, Jeonghee Kim, JungWoo Ha, and Byoung-Tak Zhang.","author":"Kim Jin-Hwa","year":"2016"},{"key":"e_1_3_2_2_27_1","volume-title":"Zemel","author":"Kiros Ryan","year":"2014"},{"key":"e_1_3_2_2_28_1","volume-title":"Stacked Cross Attention for Image-Text Matching. CoRR","author":"Lee Kuang-Huei","year":"2018"},{"key":"e_1_3_2_2_29_1","volume-title":"Unicoder-VL: A Universal Encoder for Vision and Language by Cross-modal Pre-training. CoRR","author":"Li Gen","year":"2019"},{"key":"e_1_3_2_2_30_1","volume-title":"Harmonious Attention Network for Person Re-Identification. CoRR","author":"Li Wei","year":"2018"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.124"},{"key":"e_1_3_2_2_32_1","volume-title":"ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. CoRR","author":"Lu Jiasen","year":"2019"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.5555\/3157096.3157129"},{"key":"e_1_3_2_2_34_1","volume-title":"Spatial-Content Image Search in Complex Scenes. In 2020 IEEE Winter Conference on Applications of Computer Vision (WACV). 2492--2500","author":"Ma J.","year":"2020"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.125"},{"key":"e_1_3_2_2_36_1","volume-title":"Attributes as Operators. CoRR","author":"Nagarajan Tushar","year":"2018"},{"key":"e_1_3_2_2_37_1","volume-title":"Improved Fusion of Visual and Language Representations by Dense Symmetric Co-Attention for Visual Question Answering. CoRR","author":"Nguyen Duy-Kien","year":"2018"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.11"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"OM Parkhi A Vedaldi and A Zisserman. 2015. Deep face recognition. 1--12.  OM Parkhi A Vedaldi and A Zisserman. 2015. Deep face recognition. 1--12.","DOI":"10.5244\/C.29.41"},{"key":"e_1_3_2_2_40_1","volume-title":"Courville","author":"Perez Ethan","year":"2017"},{"key":"e_1_3_2_2_41_1","volume-title":"CNN Image Retrieval Learns from BoW: Unsupervised Fine-Tuning with Hard Examples. CoRR","author":"Radenovic Filip","year":"2016"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_46"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/76.718510"},{"key":"e_1_3_2_2_44_1","volume-title":"Lillicrap","author":"Santoro Adam","year":"2017"},{"key":"e_1_3_2_2_45_1","volume-title":"FaceNet: A Unified Embedding for Face Recognition and Clustering. CoRR","author":"Schroff Florian","year":"2015"},{"key":"e_1_3_2_2_46_1","volume-title":"Deep Relative Attributes. CoRR","author":"Souri Yaser","year":"2015"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.244"},{"key":"e_1_3_2_2_48_1","volume-title":"CoRR","author":"Vaswani Ashish","year":"2017"},{"key":"e_1_3_2_2_49_1","volume-title":"Show and Tell: A Neural Image Caption Generator. CoRR","author":"Vinyals Oriol","year":"2014"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00660"},{"key":"e_1_3_2_2_51_1","volume-title":"Learning Deep Structure-Preserving Image-Text Embeddings. CoRR","author":"Wang Liwei","year":"2015"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00586"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_28"},{"key":"e_1_3_2_2_54_1","volume-title":"Attention-Aware Compositional Network for Person Re-identification. CoRR","author":"Xu Jing","year":"2018"},{"key":"e_1_3_2_2_55_1","volume-title":"Attend and Tell: Neural Image Caption Generation with Visual Attention. CoRR","author":"Xu Kelvin","year":"2015"},{"key":"e_1_3_2_2_56_1","volume-title":"Cross-Modal Self-Attention Network for Referring Image Segmentation. CoRR","author":"Ye Linwei","year":"2019"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_19"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.93"},{"key":"e_1_3_2_2_59_1","volume-title":"Multi-modal Factorized Bilinear Pooling with Co-Attention Learning for Visual Question Answering. CoRR","author":"Yu Zhou","year":"2017"},{"key":"e_1_3_2_2_60_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Zhang Qi"},{"key":"e_1_3_2_2_61_1","volume-title":"Structured Attentions for Visual Question Answering. CoRR","author":"Zhu Chen","year":"2071"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475659","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475659","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:24Z","timestamp":1750193304000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475659"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":61,"alternative-id":["10.1145\/3474085.3475659","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475659","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}