{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T15:09:35Z","timestamp":1743088175661,"version":"3.40.3"},"publisher-location":"Cham","reference-count":49,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031730238"},{"type":"electronic","value":"9783031730245"}],"license":[{"start":{"date-parts":[[2024,11,24]],"date-time":"2024-11-24T00:00:00Z","timestamp":1732406400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,24]],"date-time":"2024-11-24T00:00:00Z","timestamp":1732406400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73024-5_16","type":"book-chapter","created":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T16:42:29Z","timestamp":1732552949000},"page":"261-278","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Learning to\u00a0Build by\u00a0Building Your Own Instructions"],"prefix":"10.1007","author":[{"given":"Aaron","family":"Walsman","sequence":"first","affiliation":[]},{"given":"Muru","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Adam","family":"Fishman","sequence":"additional","affiliation":[]},{"given":"Ali","family":"Farhadi","sequence":"additional","affiliation":[]},{"given":"Dieter","family":"Fox","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,24]]},"reference":[{"key":"16_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"717","DOI":"10.1007\/978-3-319-46478-7_44","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Bulat","year":"2016","unstructured":"Bulat, A., Tzimiropoulos, G.: Human pose estimation via convolutional part heatmap regression. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9911, pp. 717\u2013732. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46478-7_44"},{"key":"16_CR2","doi-asserted-by":"crossref","unstructured":"Chang, M., et\u00a0al.: Goat: go to any thing. arXiv preprint arXiv:2311.06430 (2023)","DOI":"10.15607\/RSS.2024.XX.073"},{"key":"16_CR3","unstructured":"Chen, Q., Memmel, M., Fang, A., Walsman, A., Fox, D., Gupta, A.: Urdformer: constructing interactive realistic scenes from real images via simulation and generative modeling. In: Towards Generalist Robots: Learning Paradigms for Scalable Skill Acquisition@ CoRL2023 (2023)"},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Cho, K., Van\u00a0Merri\u00ebnboer, B., Bahdanau, D., Bengio, Y.: On the properties of neural machine translation: encoder-decoder approaches. arXiv preprint arXiv:1409.1259 (2014)","DOI":"10.3115\/v1\/W14-4012"},{"issue":"2","key":"16_CR5","doi-asserted-by":"publisher","first-page":"125","DOI":"10.1109\/70.928558","volume":"17","author":"H Choset","year":"2001","unstructured":"Choset, H., Nagatani, K.: Topological simultaneous localization and mapping (slam): toward exact localization without explicit localization. IEEE Trans. Robot. Autom. 17(2), 125\u2013137 (2001)","journal-title":"IEEE Trans. Robot. Autom."},{"key":"16_CR6","first-page":"5745","volume":"34","author":"H Chung","year":"2021","unstructured":"Chung, H., et al.: Brick-by-brick: combinatorial construction with deep reinforcement learning. Adv. Neural. Inf. Process. Syst. 34, 5745\u20135757 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR7","unstructured":"Czarnecki, W.M., Pascanu, R., Osindero, S., Jayakumar, S., Swirszcz, G., Jaderberg, M.: Distilling policy distillation. In: The 22nd International Conference on Artificial Intelligence and Statistics, pp. 1331\u20131340. PMLR (2019)"},{"key":"16_CR8","doi-asserted-by":"crossref","unstructured":"Deitke, M., et\u00a0al.: Robothor: an open simulation-to-real embodied ai platform. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3164\u20133174 (2020)","DOI":"10.1109\/CVPR42600.2020.00323"},{"key":"16_CR9","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"issue":"6","key":"16_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3272127.3275006","volume":"37","author":"T Du","year":"2018","unstructured":"Du, T., et al.: Inversecsg: automatic conversion of 3d models to csg trees. ACM Trans. Graph. (TOG) 37(6), 1\u201316 (2018)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"16_CR11","unstructured":"Fan, L., et al.: Minedojo: building open-ended embodied agents with internet-scale knowledge. In: Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (2022). https:\/\/openreview.net\/forum?id=rc8o_j8I8PX"},{"key":"16_CR12","unstructured":"Gordon, D., Fox, D., Farhadi, A.: What should i do now? marrying reinforcement learning and symbolic planning. arXiv preprint arXiv:1901.01492 (2019)"},{"key":"16_CR13","unstructured":"Graves, A., Wayne, G., Danihelka, I.: Neural turing machines. arXiv preprint arXiv:1410.5401 (2014)"},{"key":"16_CR14","doi-asserted-by":"crossref","unstructured":"Gupta, A., Fox, D., Curless, B., Cohen, M.: Duplotrack: a real-time system for authoring and guiding duplo block assembly. In: Proceedings of the 25th Annual ACM Symposium on User Interface Software and Technology, pp. 389\u2013402 (2012)","DOI":"10.1145\/2380116.2380167"},{"issue":"5","key":"16_CR15","doi-asserted-by":"publisher","first-page":"647","DOI":"10.1177\/0278364911434148","volume":"31","author":"P Henry","year":"2012","unstructured":"Henry, P., Krainin, M., Herbst, E., Ren, X., Fox, D.: Rgb-d mapping: using kinect-style depth cameras for dense 3d modeling of indoor environments. Int. J. Rob. Res. 31(5), 647\u2013663 (2012)","journal-title":"Int. J. Rob. Res."},{"issue":"8","key":"16_CR16","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"issue":"6","key":"16_CR17","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3478513.3480562","volume":"40","author":"B Jones","year":"2021","unstructured":"Jones, B., Hildreth, D., Chen, D., Baran, I., Kim, V.G., Schulz, A.: Automate: a dataset and learning approach for automatic mating of cad assemblies. ACM Trans. Graph. (TOG) 40(6), 1\u201318 (2021)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"16_CR18","doi-asserted-by":"crossref","unstructured":"Jordan, M.I.: Serial order: a parallel distributed processing approach. In: Advances in Psychology, vol.\u00a0121, pp. 471\u2013495. Elsevier (1997)","DOI":"10.1016\/S0166-4115(97)80111-2"},{"key":"16_CR19","unstructured":"Kim, J.W., Kang, K.K., Lee, J.H.: Survey on automated lego assembly construction (2014)"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"Lee, S., Kim, J., Kim, J.W., Moon, B.R.: Finding an optimal lego\u00ae brick layout of voxelized 3d object using a genetic algorithm. In: Proceedings of the 2015 Annual Conference on Genetic and Evolutionary Computation, pp. 1215\u20131222 (2015)","DOI":"10.1145\/2739480.2754667"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"Lee, Y., Hu, E.S., Lim, J.J.: Ikea furniture assembly environment for long-horizon complex manipulation tasks. In: 2021 IEEE International Conference on Robotics and Automation (ICRA), pp. 6343\u20136349. IEEE (2021)","DOI":"10.1109\/ICRA48506.2021.9560986"},{"key":"16_CR22","unstructured":"Lennon, K., et al.: Image2lego: customized lego set generation from images. arXiv preprint arXiv:2108.08477 (2021)"},{"key":"16_CR23","doi-asserted-by":"crossref","unstructured":"Leonard, J.J., Durrant-Whyte, H.F.: Simultaneous map building and localization for an autonomous mobile robot. In: IROS, vol.\u00a03, pp. 1442\u20131447 (1991)","DOI":"10.1109\/IROS.1991.174711"},{"issue":"6","key":"16_CR24","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417763","volume":"39","author":"C Li","year":"2020","unstructured":"Li, C., Pan, H., Bousseau, A., Mitra, N.J.: Sketch2cad: sequential cad modeling by sketching in context. ACM Trans. Graph. (TOG) 39(6), 1\u201314 (2020)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"16_CR25","doi-asserted-by":"crossref","unstructured":"Li, K., Bian, J.W., Castle, R., Torr, P.H., Prisacariu, V.A.: Mobilebrick: building lego for 3d reconstruction on mobile devices. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4892\u20134901 (2023)","DOI":"10.1109\/CVPR52729.2023.00474"},{"key":"16_CR26","doi-asserted-by":"crossref","unstructured":"Lim, J.J., Pirsiavash, H., Torralba, A.: Parsing ikea objects: fine pose estimation. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2992\u20132999 (2013)","DOI":"10.1109\/ICCV.2013.372"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Mo, K., et al.: Structurenet: hierarchical graph networks for 3d shape generation. arXiv preprint arXiv:1908.00575 (2019)","DOI":"10.1145\/3355089.3356527"},{"key":"16_CR28","doi-asserted-by":"crossref","unstructured":"Niu, C., Li, J., Xu, K.: Im2struct: recovering 3d shape structure from a single rgb image. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4521\u20134529 (2018)","DOI":"10.1109\/CVPR.2018.00475"},{"issue":"2","key":"16_CR29","first-page":"155","volume":"17","author":"M Peysakhov","year":"2003","unstructured":"Peysakhov, M., Regli, W.C.: Using assembly representations to enable evolutionary design of lego structures. Ai Edam 17(2), 155\u2013168 (2003)","journal-title":"Ai Edam"},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"Ranftl, R., Bochkovskiy, A., Koltun, V.: Vision transformers for dense prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12179\u201312188 (2021)","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"16_CR31","unstructured":"Ross, S., Gordon, G., Bagnell, D.: A reduction of imitation learning and structured prediction to no-regret online learning. In: Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics, pp. 627\u2013635. JMLR Workshop and Conference Proceedings (2011)"},{"key":"16_CR32","doi-asserted-by":"crossref","unstructured":"Rumelhart, D.E., Hinton, G.E., Williams, R.J., et\u00a0al.: Learning internal representations by error propagation (1985)","DOI":"10.21236\/ADA164453"},{"key":"16_CR33","doi-asserted-by":"crossref","unstructured":"Savva, M., et\u00a0al.: Habitat: a platform for embodied ai research. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9339\u20139347 (2019)","DOI":"10.1109\/ICCV.2019.00943"},{"key":"16_CR34","unstructured":"Shen, B., et\u00a0al.: igibson, a simulation environment for interactive tasks in large realisticscenes. arXiv preprint arXiv:2012.02924 (2020)"},{"key":"16_CR35","doi-asserted-by":"crossref","unstructured":"Su\u00e1rez-Ruiz, F., Zhou, X., Pham, Q.C.: Can robots assemble an ikea chair? Sci. Rob. 3(17), eaat6385 (2018)","DOI":"10.1126\/scirobotics.aat6385"},{"key":"16_CR36","unstructured":"Szot, A., et\u00a0al.: Habitat 2.0: training home assistants to rearrange their habitat. Adv. Neural Inf. Process. Syst. 34, 251\u2013266 (2021)"},{"key":"16_CR37","unstructured":"Thompson, R., Ghalebi, E., DeVries, T., Taylor, G.W.: Building lego using deep generative models of graphs. arXiv preprint arXiv:2012.11543 (2020)"},{"key":"16_CR38","doi-asserted-by":"crossref","unstructured":"Tian, Y., et al.: Assemble them all: physics-based planning for generalizable assembly by disassembly. ACM Trans. Graph. 41(6) (2022)","DOI":"10.1145\/3550454.3555525"},{"key":"16_CR39","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"issue":"7782","key":"16_CR40","doi-asserted-by":"publisher","first-page":"350","DOI":"10.1038\/s41586-019-1724-z","volume":"575","author":"O Vinyals","year":"2019","unstructured":"Vinyals, O., et al.: Grandmaster level in starcraft ii using multi-agent reinforcement learning. Nature 575(7782), 350\u2013354 (2019)","journal-title":"Nature"},{"key":"16_CR41","doi-asserted-by":"publisher","unstructured":"Walsman, A., Zhang, M., Kotar, K., Desingh, K., Farhadi, A., Fox, D.: Break and make: interactive structural understanding using lego bricks. In: European Conference on Computer Vision, pp. 90\u2013107. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_6","DOI":"10.1007\/978-3-031-19815-1_6"},{"key":"16_CR42","doi-asserted-by":"publisher","unstructured":"Wang, R., Zhang, Y., Mao, J., Cheng, C.Y., Wu, J.: Translating a visual lego manual to a machine-executable plan. In: European Conference on Computer Vision, pp. 677\u2013694. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19836-6_38","DOI":"10.1007\/978-3-031-19836-6_38"},{"key":"16_CR43","unstructured":"Wani, S., Patel, S., Jain, U., Chang, A.X., Savva, M.: Multi-on: benchmarking semantic map memory using multi-object navigation. In: Neural Information Processing Systems (NeurIPS) (2020)"},{"issue":"4","key":"16_CR44","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3450626.3459818","volume":"40","author":"KD Willis","year":"2021","unstructured":"Willis, K.D., et al.: Fusion 360 gallery: a dataset and environment for programmatic cad construction from human design sequences. ACM Trans. Graph. (TOG) 40(4), 1\u201324 (2021)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"16_CR45","doi-asserted-by":"crossref","unstructured":"Xu, X., Peng, W., Cheng, C.Y., Willis, K.D., Ritchie, D.: Inferring cad modeling sequences using zone graphs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6062\u20136070 (2021)","DOI":"10.1109\/CVPR46437.2021.00600"},{"key":"16_CR46","unstructured":"Yan, C., Misra, D., Bennnett, A., Walsman, A., Bisk, Y., Artzi, Y.: Chalet: cornell house agent learning environment. arXiv preprint arXiv:1801.07357 (2018)"},{"key":"16_CR47","doi-asserted-by":"crossref","unstructured":"Zakka, K., Zeng, A., Lee, J., Song, S.: Form2fit: learning shape priors for generalizable assembly from disassembly. In: 2020 IEEE International Conference on Robotics and Automation (ICRA), pp. 9404\u20139410. IEEE (2020)","DOI":"10.1109\/ICRA40945.2020.9196733"},{"key":"16_CR48","first-page":"6315","volume":"33","author":"G Zhan","year":"2020","unstructured":"Zhan, G., et al.: Generative 3d part assembly via dynamic graph learning. Adv. Neural. Inf. Process. Syst. 33, 6315\u20136326 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, J., Cherian, A., Liu, Y., Ben-Shabat, Y., Rodriguez, C., Gould, S.: Aligning step-by-step instructional diagrams to video demonstrations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2483\u20132492 (2023)","DOI":"10.1109\/CVPR52729.2023.00245"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73024-5_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T17:08:45Z","timestamp":1732554525000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73024-5_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,24]]},"ISBN":["9783031730238","9783031730245"],"references-count":49,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73024-5_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,24]]},"assertion":[{"value":"24 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}