{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:50:44Z","timestamp":1772909444023,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680903","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"2214-2222","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["NFT1000: A Cross-Modal Dataset For Non-Fungible Token Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-6156-1749","authenticated-orcid":false,"given":"Shuxun","family":"Wang","sequence":"first","affiliation":[{"name":"MAIS, Institute of Automation, CAS &amp; School of AI, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8441-0958","authenticated-orcid":false,"given":"Yunfei","family":"Lei","sequence":"additional","affiliation":[{"name":"Beijing University of Aeronautics and Astronautics, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5937-183X","authenticated-orcid":false,"given":"Ziqi","family":"Zhang","sequence":"additional","affiliation":[{"name":"MAIS, Institute of Automation, CAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9873-304X","authenticated-orcid":false,"given":"Wei","family":"Liu","sequence":"additional","affiliation":[{"name":"MAIS, Institute of Automation, CAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0439-2692","authenticated-orcid":false,"given":"Haowei","family":"Liu","sequence":"additional","affiliation":[{"name":"MAIS, Institute of Automation, CAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3410-7856","authenticated-orcid":false,"given":"Li","family":"Yang","sequence":"additional","affiliation":[{"name":"MAIS, Institute of Automation, CAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6114-1411","authenticated-orcid":false,"given":"Bing","family":"Li","sequence":"additional","affiliation":[{"name":"MAIS, Institute of Automation, CAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3936-0308","authenticated-orcid":false,"given":"Wenjuan","family":"Li","sequence":"additional","affiliation":[{"name":"MAIS, Institute of Automation, CAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8925-5215","authenticated-orcid":false,"given":"Jin","family":"Gao","sequence":"additional","affiliation":[{"name":"MAIS, Institute of Automation, CAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9237-8825","authenticated-orcid":false,"given":"Weiming","family":"Hu","sequence":"additional","affiliation":[{"name":"MAIS, Institute of Automation, CAS &amp; School of AI, University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Vitalik Buterin et al. 2014. A next-generation smart contract and decentralized application platform. white paper Vol. 3 37 (2014) 2--1."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01060"},{"key":"e_1_3_2_1_3_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_4_1","volume-title":"Devise: A deep visual-semantic embedding model. Advances in neural information processing systems","author":"Frome Andrea","year":"2013","unstructured":"Andrea Frome, Greg S Corrado, Jon Shlens, Samy Bengio, Jeff Dean, Marc'Aurelio Ranzato, and Tomas Mikolov. 2013. Devise: A deep visual-semantic embedding model. Advances in neural information processing systems, Vol. 26 (2013)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_6_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840--6851."},{"key":"e_1_3_2_1_7_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_8_1","unstructured":"Yuqi Huo Manli Zhang Guangzhen Liu Haoyu Lu Yizhao Gao Guoxing Yang Jingyuan Wen Heng Zhang Baogui Xu Weihao Zheng et al. 2021. WenLan: Bridging vision and language by large-scale multi-modal pre-training. arXiv preprint arXiv:2103.06561 (2021)."},{"key":"e_1_3_2_1_9_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730--19742."},{"key":"e_1_3_2_1_10_1","volume-title":"International conference on machine learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_11_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings, Part V 13","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part V 13. Springer, 740--755."},{"key":"e_1_3_2_1_13_1","volume-title":"Make web3. 0 connected","author":"Liu Zhuotao","year":"2021","unstructured":"Zhuotao Liu, Yangxi Xiang, Jian Shi, Peng Gao, Haoyu Wang, Xusheng Xiao, Bihan Wen, Qi Li, and Yih-Chun Hu. 2021. Make web3. 0 connected. IEEE transactions on dependable and secure computing, Vol. 19, 5 (2021), 2965--2981."},{"key":"e_1_3_2_1_14_1","volume-title":"Qingwen lin, and Daxin Jiang","author":"Ziyang","year":"2023","unstructured":"Ziyang luo, Pu Zhao, Can Xu, Xiubo Geng, Tao Shen, Chongyang Tao, Jing Ma, Qingwen lin, and Daxin Jiang. 2023. LexLIP: Lexicon-Bottlenecked Language-Image Pre-Training for Large-Scale Image-Text Retrieval. arxiv: 2302.02908 [cs.CV]"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0016-0032(96)00063--4"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.3390\/encyclopedia2010031"},{"key":"e_1_3_2_1_17_1","volume-title":"Bitcoin: A peer-to-peer electronic cash system.","author":"Nakamoto Satoshi","year":"2008","unstructured":"Satoshi Nakamoto. 2008. Bitcoin: A peer-to-peer electronic cash system. (2008)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_19_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arxiv: 2103.00020 [cs.CV]"},{"key":"e_1_3_2_1_20_1","first-page":"25278","article-title":"Laion-5b: An open large-scale dataset for training next generation image-text models","volume":"35","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, et al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems, Vol. 35 (2022), 25278--25294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_21_1","volume-title":"EVA-CLIP: Improved Training Techniques for CLIP at Scale. arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023. EVA-CLIP: Improved Training Techniques for CLIP at Scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_24_1","volume-title":"Applications of a Kullback-Leibler divergence for comparing non-nested models. Statistical modelling","author":"Wang Chen-Pin","year":"2013","unstructured":"Chen-Pin Wang and Booil Jo. 2013. Applications of a Kullback-Leibler divergence for comparing non-nested models. Statistical modelling, Vol. 13, 5--6 (2013), 409--429."},{"key":"e_1_3_2_1_25_1","unstructured":"Qin Wang Rujia Li Qi Wang and Shiping Chen. 2021. Non-Fungible Token (NFT): Overview Evaluation Opportunities and Challenges. arxiv: 2105.07447 [cs.CR]"},{"key":"e_1_3_2_1_26_1","volume-title":"A survey on metaverse: Fundamentals, security, and privacy","author":"Wang Yuntao","year":"2022","unstructured":"Yuntao Wang, Zhou Su, Ning Zhang, Rui Xing, Dongxiao Liu, Tom H Luan, and Xuemin Shen. 2022. A survey on metaverse: Fundamentals, security, and privacy. IEEE Communications Surveys & Tutorials (2022)."},{"key":"e_1_3_2_1_27_1","volume-title":"Po-Yao Huang, Russell Howes, Vasu Sharma, Shang-Wen Li, Gargi Ghosh, Luke Zettlemoyer, and Christoph Feichtenhofer.","author":"Xu Hu","year":"2024","unstructured":"Hu Xu, Saining Xie, Xiaoqing Ellen Tan, Po-Yao Huang, Russell Howes, Vasu Sharma, Shang-Wen Li, Gargi Ghosh, Luke Zettlemoyer, and Christoph Feichtenhofer. 2024. Demystifying CLIP Data. arxiv: 2309.16671 [cs.CV] https:\/\/arxiv.org\/abs\/2309.16671"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413726"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1504\/IJWGS.2018.095647"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680903","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680903","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:33Z","timestamp":1750295853000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680903"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":29,"alternative-id":["10.1145\/3664647.3680903","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680903","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}