{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T18:07:25Z","timestamp":1764785245547,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the National Key RD Program of China under Grant","award":["No. 2021ZD0110404"],"award-info":[{"award-number":["No. 2021ZD0110404"]}]},{"name":"the Key Research and Development Program of Shannxi","award":["No. 2021ZDLGY0106 and No. 2022ZDLGY0112"],"award-info":[{"award-number":["No. 2021ZDLGY0106 and No. 2022ZDLGY0112"]}]},{"name":"the National Natural Science Foundation of China","award":["No. 62271377, 62201407, and 61977052"],"award-info":[{"award-number":["No. 62271377, 62201407, and 61977052"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612207","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"6510-6519","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Knowledge Decomposition and Replay: A Novel Cross-modal Image-Text Retrieval Continual Learning Method"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3209-0456","authenticated-orcid":false,"given":"Rui","family":"Yang","sequence":"first","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4940-1211","authenticated-orcid":false,"given":"Shuang","family":"Wang","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7927-3037","authenticated-orcid":false,"given":"Huan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8528-5079","authenticated-orcid":false,"given":"Siyuan","family":"Xu","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6144-5070","authenticated-orcid":false,"given":"YanHe","family":"Guo","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0088-606X","authenticated-orcid":false,"given":"Xiutiao","family":"Ye","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1996-186X","authenticated-orcid":false,"given":"Biao","family":"Hou","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3354-9617","authenticated-orcid":false,"given":"Licheng","family":"Jiao","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.753"},{"key":"e_1_3_2_1_2_1","volume-title":"Computational principles of synaptic memory consolidation. Nature neuroscience","author":"Benna Marcus K","year":"2016","unstructured":"Marcus K Benna and Stefano Fusi. 2016. Computational principles of synaptic memory consolidation. Nature neuroscience, Vol. 19, 12 (2016), 1697--1706."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/279943.279962"},{"key":"e_1_3_2_1_4_1","first-page":"14879","article-title":"Coresets via bilevel optimization for continual learning and streaming","volume":"33","author":"Borsos Zal\u00e1n","year":"2020","unstructured":"Zal\u00e1n Borsos, Mojmir Mutny, and Andreas Krause. 2020. Coresets via bilevel optimization for continual learning and streaming. Advances in Neural Information Processing Systems, Vol. 33 (2020), 14879--14890.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_5_1","volume-title":"Philip HS Torr, and M Ranzato","author":"Chaudhry Arslan","year":"2019","unstructured":"Arslan Chaudhry, Marcus Rohrbach, Mohamed Elhoseiny, Thalaiyasingam Ajanthan, Puneet K Dokania, Philip HS Torr, and M Ranzato. 2019. Continual learning with tiny episodic memories. (2019)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553391"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3057446"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00814"},{"key":"e_1_3_2_1_10_1","volume-title":"Lifelong Machine Learning Workshop at ICML 2020","author":"Chiaro Riccardo Del","year":"2020","unstructured":"Riccardo Del Chiaro, Bart\u0142omiej Twardowski, Andrew D Bagdanov, and Joost van de Weijer. 2020. Ratt: Recurrent attention to transient tasks for continual image captioning. Lifelong Machine Learning Workshop at ICML 2020 (2020)."},{"key":"e_1_3_2_1_11_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_12_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_23"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2021.08.002"},{"key":"e_1_3_2_1_15_1","unstructured":"Geoffrey Hinton Oriol Vinyals Jeff Dean et al. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 Vol. 2 7 (2015)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00645"},{"key":"e_1_3_2_1_17_1","volume-title":"Pixel-bert: Aligning image pixels with text by deep multi-modal transformers. arXiv preprint arXiv:2004.00849","author":"Huang Zhicheng","year":"2020","unstructured":"Zhicheng Huang, Zhaoyang Zeng, Bei Liu, Dongmei Fu, and Jianlong Fu. 2020. Pixel-bert: Aligning image pixels with text by deep multi-modal transformers. arXiv preprint arXiv:2004.00849 (2020)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11595"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01052"},{"key":"e_1_3_2_1_20_1","volume-title":"International Conference on Machine Learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning. PMLR, 4904--4916."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_22_1","volume-title":"Deep fragment embeddings for bidirectional image sentence mapping. Advances in neural information processing systems","author":"Karpathy Andrej","year":"2014","unstructured":"Andrej Karpathy, Armand Joulin, and Li F Fei-Fei. 2014. Deep fragment embeddings for bidirectional image sentence mapping. Advances in neural information processing systems, Vol. 27 (2014)."},{"key":"e_1_3_2_1_23_1","volume-title":"International Conference on Machine Learning. PMLR, 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International Conference on Machine Learning. PMLR, 5583--5594."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1611835114"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_26_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"e_1_3_2_1_28_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019a. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_29_1","volume-title":"Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision-ECCV 2020: 16th European Conference","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XXX 16. Springer, 121--137."},{"key":"e_1_3_2_1_30_1","volume-title":"Learning without forgetting","author":"Li Zhizhong","year":"2017","unstructured":"Zhizhong Li and Derek Hoiem. 2017. Learning without forgetting. IEEE transactions on pattern analysis and machine intelligence, Vol. 40, 12 (2017), 2935--2947."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00019"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems. 6470--6479","author":"Lopez-Paz David","year":"2017","unstructured":"David Lopez-Paz and Marc'Aurelio Ranzato. 2017. Gradient episodic memory for continual learning. In Proceedings of the 31st International Conference on Neural Information Processing Systems. 6470--6479."},{"key":"e_1_3_2_1_34_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1749-6632.1998.tb08212.x"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00138"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00511"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01392-1"},{"key":"e_1_3_2_1_39_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.587"},{"key":"e_1_3_2_1_41_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Rolnick David","year":"2019","unstructured":"David Rolnick, Arun Ahuja, Jonathan Schwarz, Timothy Lillicrap, and Gregory Wayne. 2019. Experience replay for continual learning. Advances in Neural Information Processing Systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_42_1","volume-title":"Experience replay for continual learning. arXiv preprint arXiv:1811.11682","author":"Rolnick David","year":"2018","unstructured":"David Rolnick, Arun Ahuja, Jonathan Schwarz, Timothy P Lillicrap, and Greg Wayne. 2018. Experience replay for continual learning. arXiv preprint arXiv:1811.11682 (2018)."},{"key":"e_1_3_2_1_43_1","volume-title":"Progressive neural networks. arXiv preprint arXiv:1606.04671","author":"Rusu Andrei A","year":"2016","unstructured":"Andrei A Rusu, Neil C Rabinowitz, Guillaume Desjardins, Hubert Soyer, James Kirkpatrick, Koray Kavukcuoglu, Razvan Pascanu, and Raia Hadsell. 2016. Progressive neural networks. arXiv preprint arXiv:1606.04671 (2016)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF03172815"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i11.17159"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00905"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00554"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3002177"},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops. 3628--3638","author":"Wang Kai","key":"e_1_3_2_1_49_1","unstructured":"Kai Wang, Luis Herranz, and Joost van de Weijer. 2021. Continual Learning in Cross-Modal Retrieval. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops. 3628--3638."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350875"},{"key":"e_1_3_2_1_51_1","first-page":"15173","article-title":"Supermasks in superposition","volume":"33","author":"Wortsman Mitchell","year":"2020","unstructured":"Mitchell Wortsman, Vivek Ramanujan, Rosanne Liu, Aniruddha Kembhavi, Mohammad Rastegari, Jason Yosinski, and Ali Farhadi. 2020. Supermasks in superposition. Advances in Neural Information Processing Systems, Vol. 33 (2020), 15173--15184.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548107"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.109888"},{"key":"e_1_3_2_1_54_1","volume-title":"Advances in Neural Information Processing Systems","volume":"31","author":"Xu Ju","year":"2018","unstructured":"Ju Xu and Zhanxing Zhu. 2018. Reinforced continual learning. Advances in Neural Information Processing Systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00025"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2022.3194076"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612207","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612207","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:06:41Z","timestamp":1755821201000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612207"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":56,"alternative-id":["10.1145\/3581783.3612207","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612207","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}