{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T07:33:42Z","timestamp":1763105622885,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,10,15]],"date-time":"2018-10-15T00:00:00Z","timestamp":1539561600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Fundamental Research Project in the Science and Technology Plan of Shenzhen","award":["JCYJ20160331114551175"],"award-info":[{"award-number":["JCYJ20160331114551175"]}]},{"name":"National Natural Science Foundation of China","award":["61602314"],"award-info":[{"award-number":["61602314"]}]},{"name":"Natural Science Foundation of Guangdong Province of China","award":["2016A030313043"],"award-info":[{"award-number":["2016A030313043"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,10,15]]},"DOI":"10.1145\/3240508.3240525","type":"proceedings-article","created":{"date-parts":[[2018,10,18]],"date-time":"2018-10-18T17:52:08Z","timestamp":1539885128000},"page":"99-107","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":75,"title":["Attention-based Pyramid Aggregation Network for Visual Place Recognition"],"prefix":"10.1145","author":[{"given":"Yingying","family":"Zhu","sequence":"first","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}]},{"given":"Jiong","family":"Wang","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}]},{"given":"Lingxi","family":"Xie","sequence":"additional","affiliation":[{"name":"Johns Hopkins University, Baltimore, MD, USA"}]},{"given":"Liang","family":"Zheng","sequence":"additional","affiliation":[{"name":"Australian National University, Canberra, Australia"}]}],"member":"320","published-online":{"date-parts":[[2018,10,15]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/2001269.2001293"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Relja Arandjelovi\u0107 Petr Gronat Akihiko Torii Tomas Pajdla and Josef Sivic. 2016. NetVLAD: CNN architecture for weakly supervised place recognition. In CVPR. 5297--5307.  Relja Arandjelovi\u0107 Petr Gronat Akihiko Torii Tomas Pajdla and Josef Sivic. 2016. NetVLAD: CNN architecture for weakly supervised place recognition. In CVPR. 5297--5307.","DOI":"10.1109\/CVPR.2016.572"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.207"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Relja Arandjelovi\u0107 and Andrew Zisserman. 2014. DisLocation: Scalable descriptor distinctiveness for location recognition. In ACCV. 188--204.  Relja Arandjelovi\u0107 and Andrew Zisserman. 2014. DisLocation: Scalable descriptor distinctiveness for location recognition. In ACCV. 188--204.","DOI":"10.1007\/978-3-319-16817-3_13"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.150"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Artem Babenko Anton Slesarev Alexandr Chigorin and Victor Lempitsky. 2014. Neural codes for image retrieval. In ECCV. 584--599.  Artem Babenko Anton Slesarev Alexandr Chigorin and Victor Lempitsky. 2014. Neural codes for image retrieval. In ECCV. 584--599.","DOI":"10.1007\/978-3-319-10590-1_38"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR.2009.5336472"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995601"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Ond\u0159ej Chum James Philbin Josef Sivic Michael Isard and Andrew Zisserman. 2007. Total recall: Automatic query expansion with a generative feature model for object retrieval. In ICCV. 1--8.  Ond\u0159ej Chum James Philbin Josef Sivic Michael Isard and Andrew Zisserman. 2007. Total recall: Automatic query expansion with a generative feature model for object retrieval. In ICCV. 1--8.","DOI":"10.1109\/ICCV.2007.4408891"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995626"},{"key":"e_1_3_2_1_11_1","unstructured":"Xavier Glorot and Yoshua Bengio. 2010. Understanding the difficulty of training deep feedforward neural networks. In Aistats. 249--256.  Xavier Glorot and Yoshua Bengio. 2010. Understanding the difficulty of training deep feedforward neural networks. In Aistats. 249--256."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Albert Gordo Jon Almaz\u00e1n Jerome Revaud and Diane Larlus. 2016. Deep image retrieval: Learning global representations for image search. In ECCV. 241--257.  Albert Gordo Jon Almaz\u00e1n Jerome Revaud and Diane Larlus. 2016. Deep image retrieval: Learning global representations for image search. In ECCV. 241--257.","DOI":"10.1007\/978-3-319-46466-4_15"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2005.239"},{"key":"e_1_3_2_1_14_1","volume-title":"Efros","author":"Hays James","year":"2008","unstructured":"James Hays and Alexei A . Efros . 2008 . IM2GPS: estimating geographic information from a single image. In CVPR. 1--8. James Hays and Alexei A. Efros. 2008. IM2GPS: estimating geographic information from a single image. In CVPR. 1--8."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2014. Spatial pyramid pooling in deep convolutional networks for visual recognition. In ECCV. 346--361.  Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2014. Spatial pyramid pooling in deep convolutional networks for visual recognition. In ECCV. 346--361.","DOI":"10.1007\/978-3-319-10578-9_23"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123417"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Elad Hoffer and Nir Ailon. 2015. Deep metric learning using triplet network. In SIABAD. 84--92.  Elad Hoffer and Nir Ailon. 2015. Deep metric learning using triplet network. In SIABAD. 84--92.","DOI":"10.1007\/978-3-319-24261-3_7"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Jie Hu Li Shen and Gang Sun. 2018. Squeeze-and-Excitation Networks. CVPR.  Jie Hu Li Shen and Gang Sun. 2018. Squeeze-and-Excitation Networks. CVPR.","DOI":"10.1109\/CVPR.2018.00745"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Herv\u00e9 J\u00e9gou and Ond\u0159ej Chum. 2012. Negative evidences and co-occurences in image retrieval: The benefit of PCA and whitening. ECCV 774--787.  Herv\u00e9 J\u00e9gou and Ond\u0159ej Chum. 2012. Negative evidences and co-occurences in image retrieval: The benefit of PCA and whitening. ECCV 774--787.","DOI":"10.1007\/978-3-642-33709-3_55"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Herv\u00e9 J\u00e9gou Matthijs Douze and Cordelia Schmid. 2009. On the burstiness of visual elements. In CVPR. 1169--1176.  Herv\u00e9 J\u00e9gou Matthijs Douze and Cordelia Schmid. 2009. On the burstiness of visual elements. In CVPR. 1169--1176.","DOI":"10.1109\/CVPR.2009.5206609"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2011.235"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Albert Jim\u00e9nez Jose M. Alvarez and Xavier Gir\u00f3 Nieto. 2017. Class-weighted convolutional features for visual instance search. In BMVC. 1--12.  Albert Jim\u00e9nez Jose M. Alvarez and Xavier Gir\u00f3 Nieto. 2017. Class-weighted convolutional features for visual instance search. In BMVC. 1--12.","DOI":"10.5244\/C.31.144"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Yannis Kalantidis Clayton Mellina and Simon Osindero. 2016. Cross-dimensional weighting for aggregated deep convolutional features. In ECCV. 685--701.  Yannis Kalantidis Clayton Mellina and Simon Osindero. 2016. Cross-dimensional weighting for aggregated deep convolutional features. In ECCV. 685--701.","DOI":"10.1007\/978-3-319-46604-0_48"},{"key":"e_1_3_2_1_24_1","unstructured":"Hyo Jin Kim Enrique Dunn and Jan-Michael Frahm. 2017. Learned contextual feature reweighting for image geo-localization. In CVPR. 2136--2145.  Hyo Jin Kim Enrique Dunn and Jan-Michael Frahm. 2017. Learned contextual feature reweighting for image geo-localization. In CVPR. 2136--2145."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Jan Knopp Josef Sivic and Tomas Pajdla. 2010. Avoiding confusing features in place recognition. ECCV 748--761.   Jan Knopp Josef Sivic and Tomas Pajdla. 2010. Avoiding confusing features in place recognition. ECCV 748--761.","DOI":"10.1007\/978-3-642-15549-9_54"},{"key":"e_1_3_2_1_26_1","volume-title":"Hinton","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky , Ilya Sutskever , and Geoffrey E . Hinton . 2012 . Imagenet classification with deep convolutional neural networks. In NIPS. 1097--1105. Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton. 2012. Imagenet classification with deep convolutional neural networks. In NIPS. 1097--1105."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.68"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000029664.99615.94"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Colin McManus Winston Churchill Will Maddern Alexander D. Stewart and Paul Newman. 2014. Shady dealings: Robust long-term visual localisation using illumination invariance. In ICRA. 901--906.  Colin McManus Winston Churchill Will Maddern Alexander D. Stewart and Paul Newman. 2014. Shady dealings: Robust long-term visual localisation using illumination invariance. In ICRA. 901--906.","DOI":"10.1109\/ICRA.2014.6906961"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Sven Middelberg Torsten Sattler Ole Untzelmann and Leif Kobbelt. 2014. Scalable 6-dof localization on mobile devices. In ECCV. 268--283.  Sven Middelberg Torsten Sattler Ole Untzelmann and Leif Kobbelt. 2014. Scalable 6-dof localization on mobile devices. In ECCV. 268--283.","DOI":"10.1007\/978-3-319-10605-2_18"},{"key":"e_1_3_2_1_31_1","volume-title":"Xavier Giro-i Nieto, and Noel E. O'Connor","author":"Mohedano Eva","year":"2017","unstructured":"Eva Mohedano , Kevin McGuinness , Xavier Giro-i Nieto, and Noel E. O'Connor . 2017 . Saliency Weighted Convolutional Features for Instance Search . arXiv preprint arXiv:1711.10795 (2017). Eva Mohedano, Kevin McGuinness, Xavier Giro-i Nieto, and Noel E. O'Connor. 2017. Saliency Weighted Convolutional Features for Instance Search. arXiv preprint arXiv:1711.10795 (2017)."},{"key":"e_1_3_2_1_32_1","unstructured":"Hyeonwoo Noh Andre Araujo Jack Sim Tobias Weyand and Bohyung Han. 2017. Largescale image retrieval with attentive deep local features. In ICCV. 3456--3465.  Hyeonwoo Noh Andre Araujo Jack Sim Tobias Weyand and Bohyung Han. 2017. Largescale image retrieval with attentive deep local features. In ICCV. 3456--3465."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"James Philbin Ondrej Chum Michael Isard Josef Sivic and Andrew Zisserman. 2007. Object retrieval with large vocabularies and fast spatial matching. In CVPR. 1--8.  James Philbin Ondrej Chum Michael Isard Josef Sivic and Andrew Zisserman. 2007. Object retrieval with large vocabularies and fast spatial matching. In CVPR. 1--8.","DOI":"10.1109\/CVPR.2007.383172"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"James Philbin Ondrej Chum Michael Isard Josef Sivic and Andrew Zisserman. 2008. Lost in quantization: Improving particular object retrieval in large scale image databases. In CVPR. 1--8.  James Philbin Ondrej Chum Michael Isard Josef Sivic and Andrew Zisserman. 2008. Lost in quantization: Improving particular object retrieval in large scale image databases. In CVPR. 1--8.","DOI":"10.1109\/CVPR.2008.4587635"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Filip Radenovi\u0107 Giorgos Tolias and Ondvrej Chum. 2016. CNN image retrieval learns from BoW: Unsupervised fine-tuning with hard examples. In ECCV. 3--20.  Filip Radenovi\u0107 Giorgos Tolias and Ondvrej Chum. 2016. CNN image retrieval learns from BoW: Unsupervised fine-tuning with hard examples. In ECCV. 3--20.","DOI":"10.1007\/978-3-319-46448-0_1"},{"key":"e_1_3_2_1_36_1","volume-title":"Facenet: A unified embedding for face recognition and clustering. In CVPR. 815--823.","author":"Schroff Florian","year":"2015","unstructured":"Florian Schroff , Dmitry Kalenichenko , and James Philbin . 2015 . Facenet: A unified embedding for face recognition and clustering. In CVPR. 815--823. Florian Schroff, Dmitry Kalenichenko, and James Philbin. 2015. Facenet: A unified embedding for face recognition and clustering. In CVPR. 815--823."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2014.131"},{"key":"e_1_3_2_1_38_1","first-page":"251","article-title":"Visual instance retrieval with deep convolutional networks","volume":"4","author":"Razavian Ali Sharif","year":"2016","unstructured":"Ali Sharif Razavian , Josephine Sullivan , Stefan Carlsson , and Atsuto Maki . 2016 . Visual instance retrieval with deep convolutional networks . ITE Trans. MTA Vol. 4 , 3 (2016), 251 -- 258 . Ali Sharif Razavian, Josephine Sullivan, Stefan Carlsson, and Atsuto Maki. 2016. Visual instance retrieval with deep convolutional networks. ITE Trans. MTA Vol. 4, 3 (2016), 251--258.","journal-title":"ITE Trans. MTA"},{"key":"e_1_3_2_1_39_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very deep convolutional networks for large-scale image recognition. In ICLR.  Karen Simonyan and Andrew Zisserman. 2015. Very deep convolutional networks for large-scale image recognition. In ICLR."},{"key":"e_1_3_2_1_40_1","volume-title":"Video Google: A text retrieval approach to object matching in videos. In ICCV. 1470--1477.","author":"Sivic Josef","year":"2003","unstructured":"Josef Sivic and Andrew Zisserman . 2003 . Video Google: A text retrieval approach to object matching in videos. In ICCV. 1470--1477. Josef Sivic and Andrew Zisserman. 2003. Video Google: A text retrieval approach to object matching in videos. In ICCV. 1470--1477."},{"key":"e_1_3_2_1_41_1","unstructured":"Giorgos Tolias Ronan Sicre and Herv\u00e9 J\u00e9gou. 2016. Particular object retrieval with integral max-pooling of CNN activations. In ICLR.  Giorgos Tolias Ronan Sicre and Herv\u00e9 J\u00e9gou. 2016. Particular object retrieval with integral max-pooling of CNN activations. In ICLR."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Akihiko Torii Relja Arandjelovi\u0107 Josef Sivic Masatoshi Okutomi and Tomas Pajdla. 2015. 24\/7 place recognition by view synthesis. In CVPR. 1808--1817.  Akihiko Torii Relja Arandjelovi\u0107 Josef Sivic Masatoshi Okutomi and Tomas Pajdla. 2015. 24\/7 place recognition by view synthesis. In CVPR. 1808--1817.","DOI":"10.1109\/CVPR.2015.7298790"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.119"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Fei Wang Mengqing Jiang Chen Qian Shuo Yang Cheng Li Honggang Zhang Xiaogang Wang and Xiaoou Tang. 2017. Residual Attention Network for Image Classification. In CVPR. 3156--3164.  Fei Wang Mengqing Jiang Chen Qian Shuo Yang Cheng Li Honggang Zhang Xiaogang Wang and Xiaoou Tang. 2017. Residual Attention Network for Image Classification. In CVPR. 3156--3164.","DOI":"10.1109\/CVPR.2017.683"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.180"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Tiantian Wang Ali Borji Lihe Zhang Pingping Zhang and Huchuan Lu. 2017. A Stagewise Refinement Model for Detecting Salient Objects in Images. In CVPR. 4019--4028.  Tiantian Wang Ali Borji Lihe Zhang Pingping Zhang and Huchuan Lu. 2017. A Stagewise Refinement Model for Detecting Salient Objects in Images. In CVPR. 4019--4028.","DOI":"10.1109\/ICCV.2017.433"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Tianjun Xiao Yichong Xu Kuiyuan Yang Jiaxing Zhang Yuxin Peng and Zheng Zhang. 2015. The application of two-level attention models in deep convolutional neural network for fine-grained image classification. In CVPR. 842--850.  Tianjun Xiao Yichong Xu Kuiyuan Yang Jiaxing Zhang Yuxin Peng and Zheng Zhang. 2015. The application of two-level attention models in deep convolutional neural network for fine-grained image classification. In CVPR. 842--850.","DOI":"10.1109\/CVPR.2015.7298685"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123358"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Jiaolong Yang Peiran Ren Dongqing Zhang Dong Chen Fang Wen Hongdong Li and Gang Hua. 2017. Neural Aggregation Network for Video Face Recognition. In CVPR. 2492--2495.  Jiaolong Yang Peiran Ren Dongqing Zhang Dong Chen Fang Wen Hongdong Li and Gang Hua. 2017. Neural Aggregation Network for Video Face Recognition. In CVPR. 2492--2495.","DOI":"10.1109\/CVPR.2017.554"},{"key":"e_1_3_2_1_50_1","unstructured":"Hengshuang Zhao Jianping Shi Xiaojuan Qi Xiaogang Wang and Jiaya Jia. 2017. Pyramid scene parsing network. In CVPR. 2881--2890.  Hengshuang Zhao Jianping Shi Xiaojuan Qi Xiaogang Wang and Jiaya Jia. 2017. Pyramid scene parsing network. In CVPR. 2881--2890."},{"key":"e_1_3_2_1_51_1","volume-title":"SIFT meets CNN: A decade survey of instance retrieval. TPAMI","author":"Zheng Liang","year":"2017","unstructured":"Liang Zheng , Yi Yang , and Qi Tian . 2017. SIFT meets CNN: A decade survey of instance retrieval. TPAMI ( 2017 ). Liang Zheng, Yi Yang, and Qi Tian. 2017. SIFT meets CNN: A decade survey of instance retrieval. TPAMI (2017)."}],"event":{"name":"MM '18: ACM Multimedia Conference","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Seoul Republic of Korea","acronym":"MM '18"},"container-title":["Proceedings of the 26th ACM international conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3240508.3240525","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3240508.3240525","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T00:44:01Z","timestamp":1750207441000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3240508.3240525"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,10,15]]},"references-count":51,"alternative-id":["10.1145\/3240508.3240525","10.1145\/3240508"],"URL":"https:\/\/doi.org\/10.1145\/3240508.3240525","relation":{},"subject":[],"published":{"date-parts":[[2018,10,15]]},"assertion":[{"value":"2018-10-15","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}