five

MihaiPopa-1/minecraft-skins-1.1m-deduped-64x64

收藏
Hugging Face · 2026-04-02 更新 · 2026-04-12 收录
下载链接:
https://hf-mirror.com/datasets/MihaiPopa-1/minecraft-skins-1.1m-deduped-64x64
下载链接
链接失效反馈
官方服务:
资源简介:
---
task_categories:
- text-to-image
- image-classification
- unconditional-image-generation
tags:
- minecraft
- minecraft-skins
- de-duped
- deduped
- zip-dataset
- zip
- zip-archive
size_categories:
- 1M<n<10M
license: apache-2.0
pretty_name: Minecraft Skins 1.1M Deduped (64x64 Edition)
---

# Minecraft Skins 1.1M Deduped (64x64 Edition)!

[Nyuuzyou's Minecraft-Skins-20M](https://huggingface.co/datasets/nyuuzyou/Minecraft-Skins-20M) but deduped using BLAKE3 hashes (only catches if pixel values are exactly the same), then filtered to only valid 64x64 skins.

Format is a 2.6 GB ZIP archive containing 64x64 PNG skin files.

# Tools used

PIL Image (Python), Google Colab (free CPU tier) and BLAKE3 (Python)

# How it was made

1. Loaded [Nyuuzyou's Minecraft-Skins-20M](https://huggingface.co/datasets/nyuuzyou/Minecraft-Skins-20M),
2. Deduped using BLAKE3, only catching if pixel values are exactly the same. Tiny differences are kept,
3. Result is 1296966 unique skins.
4. Invalid, 64x32 skins or other skin sizes are removed,
5. Result is 1107411 64x64 skins.
6. Output is given in a 2.6 GB ZIP archive.

This can be used to make your own skin generation model (but I'm going with VQ-VAE anyway!)

# Future improvements for version 2

1. Captioning (with Florence 2 Base)
2. Filtering troll skins (skins that are formed of just a single color)

# Code

Code to reproduce it (all by Claude 4.6 Sonnet):

```python
# ============================================================
# Minecraft Skin Deduplicator
# Downloads nyuuzyou/Minecraft-Skins-20M, hashes pixel data
# with BLAKE3, deduplicates via SQLite, saves unique skins.
# ============================================================

# --- 1. Install dependencies ---
!pip install blake3 datasets Pillow numpy -q

import blake3
import numpy as np
import sqlite3
import os
from PIL import Image
from datasets import load_dataset
import base64
from io import BytesIO

# --- 2. Config ---
OUTPUT_DIR = "/content/unique_skins"  # where to save unique skins
DB_PATH = "/content/hashes.db"  # SQLite dedup DB (move to Drive to persist!)
MOJANG_DEFAULT_PATHS = []  # optional: put paths to Steve/Alex/etc. here

os.makedirs(OUTPUT_DIR, exist_ok=True)


# --- 3. Load known Mojang default hashes (optional but recommended) ---
def hash_pixels(img: Image.Image) -> bytes:
    """Hash the raw pixel data of a PIL image. Returns 32-byte BLAKE3 digest.

    The image is converted to RGBA first, so the digest depends on the
    decoded pixel values rather than on how the file was encoded. Only the
    pixel bytes are hashed; the image dimensions are not part of the digest.
    """
    arr = np.array(img.convert("RGBA"))
    return blake3.blake3(arr.tobytes()).digest()


blacklist = set()
for path in MOJANG_DEFAULT_PATHS:
    # Context manager so the file handle is released once the hash is taken.
    with Image.open(path) as img:
        blacklist.add(hash_pixels(img))
print(f"Blacklisted {len(blacklist)} default Mojang skins.")

# --- 4. Set up SQLite ---
conn = sqlite3.connect(DB_PATH)
conn.execute("CREATE TABLE IF NOT EXISTS seen (h BLOB PRIMARY KEY)")
conn.execute("PRAGMA journal_mode=WAL")  # faster concurrent writes
conn.execute("PRAGMA synchronous=NORMAL")  # safe but faster than FULL
conn.commit()


def already_seen(digest: bytes) -> bool:
    """Return True if this digest is already recorded in the `seen` table."""
    return conn.execute("SELECT 1 FROM seen WHERE h=?", (digest,)).fetchone() is not None


def mark_seen(digest: bytes):
    """Record a digest in the `seen` table (no-op if it is already there)."""
    conn.execute("INSERT OR IGNORE INTO seen VALUES (?)", (digest,))


def load_and_hash(row):
    """Decode the skin stored in a dataset row and hash its pixels.

    Returns an `(img, digest)` tuple, or None when the row has no image
    field or the image cannot be decoded.
    """
    # The image field — adjust key if the dataset uses a different column name
    raw = row.get("image") or row.get("skin") or row.get("img")
    if raw is None:
        return None

    # Handle both PIL images and raw bytes
    try:
        if isinstance(raw, str):
            img = Image.open(BytesIO(base64.b64decode(raw)))
        elif isinstance(raw, bytes):
            img = Image.open(BytesIO(raw))
        else:
            img = raw  # already a PIL Image
        # Image.open() is lazy: a truncated or corrupt file may only fail once
        # the pixels are actually read, so hashing has to happen inside the try.
        return img, hash_pixels(img)
    except Exception:
        # Deliberately broad: Pillow raises many different exception types for
        # malformed input, and one bad row must not abort a multi-hour run.
        return None


# --- 5. Stream & deduplicate ---
print("Loading dataset (streaming)...")
dataset = load_dataset(
    "nyuuzyou/Minecraft-Skins-20M",
    split="train",
    streaming=True,  # <-- key: don't download everything at once
)

total = 0
duplicates = 0
blacklisted = 0
skipped = 0
saved = 0
BATCH_SIZE = 500  # commit to SQLite every N rows

try:
    for row in dataset:
        total += 1

        loaded = load_and_hash(row)
        if loaded is None:
            skipped += 1  # missing or corrupt/malformed entry
        else:
            img, digest = loaded
            if digest in blacklist:
                blacklisted += 1
            elif already_seen(digest):
                duplicates += 1
            else:
                # Save unique skin
                out_path = os.path.join(OUTPUT_DIR, f"skin_{saved:08d}.png")
                img.save(out_path, format="PNG")
                mark_seen(digest)
                saved += 1

        # Batch commit and progress run for every row, including rows that
        # were skipped, duplicated or blacklisted.
        if total % BATCH_SIZE == 0:
            conn.commit()

        if total % 10_000 == 0:
            print(f" Processed: {total:,} | Saved: {saved:,} | Dupes: {duplicates:,} | Blacklisted: {blacklisted:,}")
finally:
    # Flush pending inserts and release the DB even if the stream is interrupted.
    conn.commit()
    conn.close()

print("\n=== Done! ===")
print(f"Total processed : {total:,}")
print(f"Unique skins : {saved:,}")
print(f"Duplicates : {duplicates:,}")
print(f"Blacklisted : {blacklisted:,}")
print(f"Skipped (bad) : {skipped:,}")
print(f"Reduction : {100 * (1 - saved/max(total,1)):.1f}%")
```

then:

```python
import os
import zipfile
from PIL import Image
from tqdm import tqdm

INPUT_DIR = "/content/unique_skins"
# NOTE: filtering below happens in place inside INPUT_DIR; this directory is
# created but nothing in this script writes to it.
OUTPUT_DIR = "/content/filtered_skins"
ZIP_PATH = "/content/minecraft_skins_64x64.zip"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Step 1: Filter to 64x64 only
print("Filtering to 64x64...")
all_skins = os.listdir(INPUT_DIR)
filtered = 0
skipped = 0

for filename in tqdm(all_skins):
    if not filename.endswith(".png"):
        continue
    path = os.path.join(INPUT_DIR, filename)

    # Decide first, delete afterwards: the context manager guarantees the file
    # handle is closed before os.remove() runs, and a failure in os.remove()
    # is no longer caught and retried by the corrupt-file handler.
    try:
        with Image.open(path) as img:
            keep = img.size == (64, 64)
    except Exception:
        keep = False  # corrupt or unreadable file

    if keep:
        filtered += 1
    else:
        os.remove(path)  # delete non-64x64 and corrupt files
        skipped += 1

print(f"Kept: {filtered:,} | Skipped/removed: {skipped:,}")

# Step 2: Pack into ZIP
print("\nPacking into ZIP...")
skin_files = [f for f in os.listdir(INPUT_DIR) if f.endswith(".png")]

with zipfile.ZipFile(ZIP_PATH, "w", zipfile.ZIP_DEFLATED, compresslevel=1) as zf:
    for filename in tqdm(skin_files):
        zf.write(os.path.join(INPUT_DIR, filename), arcname=filename)

print(f"\nDone! ZIP saved to {ZIP_PATH}")
print(f"ZIP size: {os.path.getsize(ZIP_PATH) / 1024 / 1024:.1f} MB")
```
提供机构:
MihaiPopa-1
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作