five

DannHiroaki/CMAB-Spatial-Join-0.08B

收藏
Hugging Face 2026-01-26 更新 2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/DannHiroaki/CMAB-Spatial-Join-0.08B
下载链接
链接失效反馈
官方服务:
资源简介:
--- --- pretty_name: CMAB-Spatial-Join-0.08B license: cc-by-4.0 language: - en tags: - geospatial - spatial-join - benchmark - building-footprints - parquet - china size_categories: - 10M<n<100M task_categories: - other configs: - config_name: summary data_files: - split: train path: "summary_stats.parquet" - config_name: level_1 data_files: - split: train path: "level_1/**/*.parquet" - config_name: level_1_anhui data_files: - split: train path: "level_1/province=anhui/*.parquet" - config_name: level_1_beijing data_files: - split: train path: "level_1/province=beijing/*.parquet" - config_name: level_1_chongqing data_files: - split: train path: "level_1/province=chongqing/*.parquet" - config_name: level_1_fujian data_files: - split: train path: "level_1/province=fujian/*.parquet" - config_name: level_1_gansu data_files: - split: train path: "level_1/province=gansu/*.parquet" - config_name: level_1_guangdong data_files: - split: train path: "level_1/province=guangdong/*.parquet" - config_name: level_1_guangxi data_files: - split: train path: "level_1/province=guangxi/*.parquet" - config_name: level_1_guizhou data_files: - split: train path: "level_1/province=guizhou/*.parquet" - config_name: level_1_hainan data_files: - split: train path: "level_1/province=hainan/*.parquet" - config_name: level_1_hebei data_files: - split: train path: "level_1/province=hebei/*.parquet" - config_name: level_1_heilongjiang data_files: - split: train path: "level_1/province=heilongjiang/*.parquet" - config_name: level_1_henan data_files: - split: train path: "level_1/province=henan/*.parquet" - config_name: level_1_hubei data_files: - split: train path: "level_1/province=hubei/*.parquet" - config_name: level_1_hunan data_files: - split: train path: "level_1/province=hunan/*.parquet" - config_name: level_1_jiangsu data_files: - split: train path: "level_1/province=jiangsu/*.parquet" - config_name: level_1_jiangxi data_files: - split: train path: "level_1/province=jiangxi/*.parquet" - 
config_name: level_1_jilin data_files: - split: train path: "level_1/province=jilin/*.parquet" - config_name: level_1_liaoning data_files: - split: train path: "level_1/province=liaoning/*.parquet" - config_name: level_1_neimenggu data_files: - split: train path: "level_1/province=neimenggu/*.parquet" - config_name: level_1_ningxia data_files: - split: train path: "level_1/province=ningxia/*.parquet" - config_name: level_1_qinghai data_files: - split: train path: "level_1/province=qinghai/*.parquet" - config_name: level_1_shaanxi data_files: - split: train path: "level_1/province=shaanxi/*.parquet" - config_name: level_1_shandong data_files: - split: train path: "level_1/province=shandong/*.parquet" - config_name: level_1_shanghai data_files: - split: train path: "level_1/province=shanghai/*.parquet" - config_name: level_1_shanxi data_files: - split: train path: "level_1/province=shanxi/*.parquet" - config_name: level_1_sichuan data_files: - split: train path: "level_1/province=sichuan/*.parquet" - config_name: level_1_tianjin data_files: - split: train path: "level_1/province=tianjin/*.parquet" - config_name: level_1_xinjiang data_files: - split: train path: "level_1/province=xinjiang/*.parquet" - config_name: level_1_xizang data_files: - split: train path: "level_1/province=xizang/*.parquet" - config_name: level_1_yunnan data_files: - split: train path: "level_1/province=yunnan/*.parquet" - config_name: level_1_zhejiang data_files: - split: train path: "level_1/province=zhejiang/*.parquet" - config_name: level_2 data_files: - split: train path: "level_2/**/*.parquet" - config_name: level_2_anhui data_files: - split: train path: "level_2/province=anhui/*.parquet" - config_name: level_2_beijing data_files: - split: train path: "level_2/province=beijing/*.parquet" - config_name: level_2_chongqing data_files: - split: train path: "level_2/province=chongqing/*.parquet" - config_name: level_2_fujian data_files: - split: train path: "level_2/province=fujian/*.parquet" - 
config_name: level_2_gansu data_files: - split: train path: "level_2/province=gansu/*.parquet" - config_name: level_2_guangdong data_files: - split: train path: "level_2/province=guangdong/*.parquet" - config_name: level_2_guangxi data_files: - split: train path: "level_2/province=guangxi/*.parquet" - config_name: level_2_guizhou data_files: - split: train path: "level_2/province=guizhou/*.parquet" - config_name: level_2_hainan data_files: - split: train path: "level_2/province=hainan/*.parquet" - config_name: level_2_hebei data_files: - split: train path: "level_2/province=hebei/*.parquet" - config_name: level_2_heilongjiang data_files: - split: train path: "level_2/province=heilongjiang/*.parquet" - config_name: level_2_henan data_files: - split: train path: "level_2/province=henan/*.parquet" - config_name: level_2_hubei data_files: - split: train path: "level_2/province=hubei/*.parquet" - config_name: level_2_hunan data_files: - split: train path: "level_2/province=hunan/*.parquet" - config_name: level_2_jiangsu data_files: - split: train path: "level_2/province=jiangsu/*.parquet" - config_name: level_2_jiangxi data_files: - split: train path: "level_2/province=jiangxi/*.parquet" - config_name: level_2_jilin data_files: - split: train path: "level_2/province=jilin/*.parquet" - config_name: level_2_liaoning data_files: - split: train path: "level_2/province=liaoning/*.parquet" - config_name: level_2_neimenggu data_files: - split: train path: "level_2/province=neimenggu/*.parquet" - config_name: level_2_ningxia data_files: - split: train path: "level_2/province=ningxia/*.parquet" - config_name: level_2_qinghai data_files: - split: train path: "level_2/province=qinghai/*.parquet" - config_name: level_2_shaanxi data_files: - split: train path: "level_2/province=shaanxi/*.parquet" - config_name: level_2_shandong data_files: - split: train path: "level_2/province=shandong/*.parquet" - config_name: level_2_shanghai data_files: - split: train path: 
"level_2/province=shanghai/*.parquet" - config_name: level_2_shanxi data_files: - split: train path: "level_2/province=shanxi/*.parquet" - config_name: level_2_sichuan data_files: - split: train path: "level_2/province=sichuan/*.parquet" - config_name: level_2_tianjin data_files: - split: train path: "level_2/province=tianjin/*.parquet" - config_name: level_2_xinjiang data_files: - split: train path: "level_2/province=xinjiang/*.parquet" - config_name: level_2_xizang data_files: - split: train path: "level_2/province=xizang/*.parquet" - config_name: level_2_yunnan data_files: - split: train path: "level_2/province=yunnan/*.parquet" - config_name: level_2_zhejiang data_files: - split: train path: "level_2/province=zhejiang/*.parquet" - config_name: level_3 data_files: - split: train path: "level_3/**/*.parquet" - config_name: level_3_anhui data_files: - split: train path: "level_3/province=anhui/*.parquet" - config_name: level_3_beijing data_files: - split: train path: "level_3/province=beijing/*.parquet" - config_name: level_3_chongqing data_files: - split: train path: "level_3/province=chongqing/*.parquet" - config_name: level_3_fujian data_files: - split: train path: "level_3/province=fujian/*.parquet" - config_name: level_3_gansu data_files: - split: train path: "level_3/province=gansu/*.parquet" - config_name: level_3_guangdong data_files: - split: train path: "level_3/province=guangdong/*.parquet" - config_name: level_3_guangxi data_files: - split: train path: "level_3/province=guangxi/*.parquet" - config_name: level_3_guizhou data_files: - split: train path: "level_3/province=guizhou/*.parquet" - config_name: level_3_hainan data_files: - split: train path: "level_3/province=hainan/*.parquet" - config_name: level_3_hebei data_files: - split: train path: "level_3/province=hebei/*.parquet" - config_name: level_3_heilongjiang data_files: - split: train path: "level_3/province=heilongjiang/*.parquet" - config_name: level_3_henan data_files: - split: train path: 
"level_3/province=henan/*.parquet" - config_name: level_3_hubei data_files: - split: train path: "level_3/province=hubei/*.parquet" - config_name: level_3_hunan data_files: - split: train path: "level_3/province=hunan/*.parquet" - config_name: level_3_jiangsu data_files: - split: train path: "level_3/province=jiangsu/*.parquet" - config_name: level_3_jiangxi data_files: - split: train path: "level_3/province=jiangxi/*.parquet" - config_name: level_3_jilin data_files: - split: train path: "level_3/province=jilin/*.parquet" - config_name: level_3_liaoning data_files: - split: train path: "level_3/province=liaoning/*.parquet" - config_name: level_3_neimenggu data_files: - split: train path: "level_3/province=neimenggu/*.parquet" - config_name: level_3_ningxia data_files: - split: train path: "level_3/province=ningxia/*.parquet" - config_name: level_3_qinghai data_files: - split: train path: "level_3/province=qinghai/*.parquet" - config_name: level_3_shaanxi data_files: - split: train path: "level_3/province=shaanxi/*.parquet" - config_name: level_3_shandong data_files: - split: train path: "level_3/province=shandong/*.parquet" - config_name: level_3_shanghai data_files: - split: train path: "level_3/province=shanghai/*.parquet" - config_name: level_3_shanxi data_files: - split: train path: "level_3/province=shanxi/*.parquet" - config_name: level_3_sichuan data_files: - split: train path: "level_3/province=sichuan/*.parquet" - config_name: level_3_tianjin data_files: - split: train path: "level_3/province=tianjin/*.parquet" - config_name: level_3_xinjiang data_files: - split: train path: "level_3/province=xinjiang/*.parquet" - config_name: level_3_xizang data_files: - split: train path: "level_3/province=xizang/*.parquet" - config_name: level_3_yunnan data_files: - split: train path: "level_3/province=yunnan/*.parquet" - config_name: level_3_zhejiang data_files: - split: train path: "level_3/province=zhejiang/*.parquet" - config_name: level_4 data_files: - split: train 
path: "level_4/**/*.parquet" - config_name: level_4_anhui data_files: - split: train path: "level_4/province=anhui/*.parquet" - config_name: level_4_beijing data_files: - split: train path: "level_4/province=beijing/*.parquet" - config_name: level_4_chongqing data_files: - split: train path: "level_4/province=chongqing/*.parquet" - config_name: level_4_fujian data_files: - split: train path: "level_4/province=fujian/*.parquet" - config_name: level_4_gansu data_files: - split: train path: "level_4/province=gansu/*.parquet" - config_name: level_4_guangdong data_files: - split: train path: "level_4/province=guangdong/*.parquet" - config_name: level_4_guangxi data_files: - split: train path: "level_4/province=guangxi/*.parquet" - config_name: level_4_guizhou data_files: - split: train path: "level_4/province=guizhou/*.parquet" - config_name: level_4_hainan data_files: - split: train path: "level_4/province=hainan/*.parquet" - config_name: level_4_hebei data_files: - split: train path: "level_4/province=hebei/*.parquet" - config_name: level_4_heilongjiang data_files: - split: train path: "level_4/province=heilongjiang/*.parquet" - config_name: level_4_henan data_files: - split: train path: "level_4/province=henan/*.parquet" - config_name: level_4_hubei data_files: - split: train path: "level_4/province=hubei/*.parquet" - config_name: level_4_hunan data_files: - split: train path: "level_4/province=hunan/*.parquet" - config_name: level_4_jiangsu data_files: - split: train path: "level_4/province=jiangsu/*.parquet" - config_name: level_4_jiangxi data_files: - split: train path: "level_4/province=jiangxi/*.parquet" - config_name: level_4_jilin data_files: - split: train path: "level_4/province=jilin/*.parquet" - config_name: level_4_liaoning data_files: - split: train path: "level_4/province=liaoning/*.parquet" - config_name: level_4_neimenggu data_files: - split: train path: "level_4/province=neimenggu/*.parquet" - config_name: level_4_ningxia data_files: - split: train 
path: "level_4/province=ningxia/*.parquet" - config_name: level_4_qinghai data_files: - split: train path: "level_4/province=qinghai/*.parquet" - config_name: level_4_shaanxi data_files: - split: train path: "level_4/province=shaanxi/*.parquet" - config_name: level_4_shandong data_files: - split: train path: "level_4/province=shandong/*.parquet" - config_name: level_4_shanghai data_files: - split: train path: "level_4/province=shanghai/*.parquet" - config_name: level_4_shanxi data_files: - split: train path: "level_4/province=shanxi/*.parquet" - config_name: level_4_sichuan data_files: - split: train path: "level_4/province=sichuan/*.parquet" - config_name: level_4_tianjin data_files: - split: train path: "level_4/province=tianjin/*.parquet" - config_name: level_4_xinjiang data_files: - split: train path: "level_4/province=xinjiang/*.parquet" - config_name: level_4_xizang data_files: - split: train path: "level_4/province=xizang/*.parquet" - config_name: level_4_yunnan data_files: - split: train path: "level_4/province=yunnan/*.parquet" - config_name: level_4_zhejiang data_files: - split: train path: "level_4/province=zhejiang/*.parquet" --- # Introduction A lightweight **axis-aligned rectangle (AABB) benchmark** derived from the **CMAB** building rooftop dataset, designed for evaluating **spatial join** pipelines (indexing, vectorized filtering, output materialization). This dataset stores **base AABBs** and **expanded (influence) AABBs** for each building under **4 workload levels**. It is organized as **Parquet shards**, partitioned by **level** and **province**. 
Dataset construction details and the reference builder are available at: https://github.com/DANNHIROAKI/CMAB-Spatial-Join-0.08B-Builder

# How to Use

#### 1) Download a small slice (recommended)

```python
from huggingface_hub import snapshot_download

# Download only what you need (e.g., Level-1 + Beijing) with snapshot_download
local_dir = snapshot_download(
    repo_id="DannHiroaki/CMAB-Spatial-Join-0.08B",
    repo_type="dataset",
    allow_patterns=[
        "dataset_metadata.json",
        "file_manifest.parquet",
        "summary_stats.parquet",
        "level_1/province=beijing/*.parquet",
    ],
)
print("Downloaded to:", local_dir)
```

#### 2) Read a shard with Polars

```python
import polars as pl
from pathlib import Path

p = Path(local_dir) / "level_1" / "province=beijing" / "data_00001.parquet"
df = pl.read_parquet(p)
print(df.select(["building_uid","func","level","d_m","xmin","ymin","xmax","ymax","exmin","eymin","exmax","eymax"]).head())
```

#### 3) Query with DuckDB (fast ad-hoc analytics)

```python
import duckdb
from pathlib import Path

glob_path = str(Path(local_dir) / "level_1" / "province=beijing" / "*.parquet")
con = duckdb.connect()
n = con.execute(f"""
    SELECT COUNT(*)
    FROM read_parquet('{glob_path}')
    WHERE func = 'Residential'
""").fetchone()[0]
print("Residential rows (level_1, beijing):", n)
```

#### 4) Use `file_manifest.parquet` to discover shards

```python
import polars as pl
from pathlib import Path

mf = pl.read_parquet(Path(local_dir) / "file_manifest.parquet")
print(mf.select(["level","province","path","num_rows"]).sort(["level","province"]).head(20))
```

# Attribution

This benchmark is derived from **CMAB**:

* Paper: *CMAB: A Multi-Attribute Building Dataset of China*

```latex
@article{Zhang2025SciData,
  author  = {Zhang, Y. and Zhao, H. and Long, Y.},
  title   = {{CMAB: A Multi-Attribute Building Dataset of China}},
  journal = {Scientific Data},
  volume  = {12},
  number  = {430},
  year    = {2025},
  doi     = {10.1038/s41597-025-04730-5},
  url     = {https://doi.org/10.1038/s41597-025-04730-5}
}
```

* Dataset: *CMAB-The World’s First National-Scale Multi-Attribute Building Dataset*

```latex
@misc{Zhang2025CMAB,
  author       = {Zhang, Yecheng and Zhao, Huimin and Long, Ying},
  title        = {{CMAB-The World's First National-Scale Multi-Attribute Building Dataset}},
  year         = {2025},
  month        = apr,
  publisher    = {figshare},
  doi          = {10.6084/m9.figshare.27992417},
  url          = {https://doi.org/10.6084/m9.figshare.27992417},
  howpublished = {dataset}
}
```

If you use this dataset in research, please cite the CMAB paper/dataset and reference this benchmark repository.
提供机构:
DannHiroaki
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作