MihaiPopa-1/minecraft-skins-1.1m-deduped-64x64
收藏Hugging Face2026-04-02 更新2026-04-12 收录
下载链接:
https://hf-mirror.com/datasets/MihaiPopa-1/minecraft-skins-1.1m-deduped-64x64
下载链接
链接失效反馈官方服务:
资源简介:
---
task_categories:
- text-to-image
- image-classification
- unconditional-image-generation
tags:
- minecraft
- minecraft-skins
- de-duped
- deduped
- zip-dataset
- zip
- zip-archive
size_categories:
- 1M<n<10M
license: apache-2.0
pretty_name: Minecraft Skins 1.1M Deduped (64x64 Edition)
---
# Minecraft Skins 1.1M Deduped (64x64 Edition)!
[Nyuuzyou's Minecraft-Skins-20M](https://huggingface.co/datasets/nyuuzyou/Minecraft-Skins-20M), deduplicated using BLAKE3 hashes of the pixel data (two skins count as duplicates only if their pixel values are exactly the same), then filtered to keep only valid 64x64 skins.
Format is a 2.6 GB ZIP archive containing 64x64 PNG skin files.
# Tools used
PIL Image (Python), Google Colab (free CPU tier) and BLAKE3 (Python)
# How it was made
1. Loaded [Nyuuzyou's Minecraft-Skins-20M](https://huggingface.co/datasets/nyuuzyou/Minecraft-Skins-20M),
2. Deduplicated using BLAKE3 hashes of the pixel data; two skins count as duplicates only if their pixel values are exactly the same, so skins with even tiny differences are kept,
3. Result is 1296966 unique skins.
4. Invalid skins, 64x32 skins, and skins of any other size are removed,
5. Result is 1107411 64x64 skins.
6. Output is given in a 2.6 GB ZIP archive.
This can be used to make your own skin generation model (but I'm going with VQ-VAE anyway!)
# Future improvements for version 2
1. Captioning (with Florence 2 Base)
2. Filtering troll skins (skins that are formed of just a single color)
# Code
Code to reproduce it (all by Claude 4.6 Sonnet):
```python
# ============================================================
# Minecraft Skin Deduplicator
# Downloads nyuuzyou/Minecraft-Skins-20M, hashes pixel data
# with BLAKE3, deduplicates via SQLite, saves unique skins.
# ============================================================
# --- 1. Install dependencies ---
!pip install blake3 datasets Pillow numpy -q
import blake3
import numpy as np
import sqlite3
import os
from PIL import Image
from datasets import load_dataset
import base64
from io import BytesIO
# --- 2. Config ---
# Directory that receives one PNG per unique skin.
OUTPUT_DIR = "/content/unique_skins"
# SQLite file holding the digests seen so far (move to Drive to persist!).
DB_PATH = "/content/hashes.db"
# Optional: paths to default skins (Steve/Alex/etc.) whose digests are
# blacklisted so matching skins are never saved.
MOJANG_DEFAULT_PATHS = []
# Create the output directory up front; an already existing one is fine.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# --- 3. Load known Mojang default hashes (optional but recommended) ---
def hash_pixels(img: Image.Image) -> bytes:
    """Return the BLAKE3 digest of an image's RGBA pixel bytes.

    The image is converted to RGBA first, so the digest reflects decoded
    pixel values rather than the encoded file contents.

    NOTE(review): only the flattened pixel bytes are hashed; the image
    width/height are not part of the digest.
    """
    rgba = img.convert("RGBA")
    pixel_bytes = np.array(rgba).tobytes()
    hasher = blake3.blake3(pixel_bytes)
    return hasher.digest()
# Digests of skins that must never be saved (the optional default skins
# listed in MOJANG_DEFAULT_PATHS).
blacklist = set()
for path in MOJANG_DEFAULT_PATHS:
    # Open through a context manager so the file handle is released as soon
    # as the pixels have been hashed instead of lingering until GC.
    with Image.open(path) as img:
        blacklist.add(hash_pixels(img))
print(f"Blacklisted {len(blacklist)} default Mojang skins.")
# --- 4. Set up SQLite ---
# One connection is shared by the helper functions and the main loop below.
conn = sqlite3.connect(DB_PATH)
_SETUP_STATEMENTS = (
    # Single-column table: the digest itself is the primary key.
    "CREATE TABLE IF NOT EXISTS seen (h BLOB PRIMARY KEY)",
    # Write-ahead logging journal mode.
    "PRAGMA journal_mode=WAL",
    # Relaxed (NORMAL) sync level instead of FULL.
    "PRAGMA synchronous=NORMAL",
)
for _statement in _SETUP_STATEMENTS:
    conn.execute(_statement)
conn.commit()
def already_seen(digest: bytes) -> bool:
    """Return True if *digest* is already stored in the ``seen`` table."""
    cursor = conn.execute("SELECT 1 FROM seen WHERE h=?", (digest,))
    match = cursor.fetchone()
    return match is not None
def mark_seen(digest: bytes) -> None:
    """Record *digest* in the ``seen`` table.

    Inserting a digest that is already present is a no-op (INSERT OR
    IGNORE). The caller is responsible for committing.
    """
    params = (digest,)
    conn.execute("INSERT OR IGNORE INTO seen VALUES (?)", params)
# --- 5. Stream & deduplicate ---
# Streams the source dataset row by row, hashes every decodable image and
# writes each first-seen skin to OUTPUT_DIR as a sequentially numbered PNG.
print("Loading dataset (streaming)...")
dataset = load_dataset(
    "nyuuzyou/Minecraft-Skins-20M",
    split="train",
    streaming=True,  # <-- key: don't download everything at once
)

total = 0        # rows pulled from the stream
duplicates = 0   # rows whose digest was already in the DB
blacklisted = 0  # rows matching a blacklisted digest
saved = 0        # unique skins written to disk
skipped = 0      # rows with no image field or an undecodable image
BATCH_SIZE = 500          # commit to SQLite every N rows
PROGRESS_EVERY = 10_000   # print a progress line every N rows

try:
    for row in dataset:
        total += 1

        # The image field — adjust key if the dataset uses a different column name
        raw = row.get("image") or row.get("skin") or row.get("img")

        digest = None
        if raw is not None:
            # Handle base64 strings, raw bytes and ready-made PIL images.
            try:
                if isinstance(raw, str):
                    img = Image.open(BytesIO(base64.b64decode(raw)))
                elif isinstance(raw, bytes):
                    img = Image.open(BytesIO(raw))
                else:
                    img = raw  # already a PIL Image
                # Image.open() only identifies the file; pixel data is decoded
                # lazily, so hashing must happen inside the try block for
                # corrupt payloads to be skipped instead of crashing the run.
                digest = hash_pixels(img)
            except Exception:
                # Deliberate best effort: any decode failure skips the row.
                digest = None

        # Classify the row without `continue`, so the periodic commit and
        # progress report below run for every row, not only for saved ones.
        if digest is None:
            skipped += 1
        elif digest in blacklist:
            blacklisted += 1
        elif already_seen(digest):
            duplicates += 1
        else:
            # Save unique skin; the digest is recorded only after the file
            # has been written successfully.
            out_path = os.path.join(OUTPUT_DIR, f"skin_{saved:08d}.png")
            img.save(out_path, format="PNG")
            mark_seen(digest)
            saved += 1

        # Batch commit
        if total % BATCH_SIZE == 0:
            conn.commit()

        # Progress
        if total % PROGRESS_EVERY == 0:
            print(f" Processed: {total:,} | Saved: {saved:,} | Dupes: {duplicates:,} | Blacklisted: {blacklisted:,}")
finally:
    # Flush pending inserts and release the DB even if the stream fails
    # part-way through.
    conn.commit()
    conn.close()

print("\n=== Done! ===")
print(f"Total processed : {total:,}")
print(f"Unique skins : {saved:,}")
print(f"Duplicates : {duplicates:,}")
print(f"Blacklisted : {blacklisted:,}")
print(f"Skipped (bad) : {skipped:,}")
print(f"Reduction : {100 * (1 - saved/max(total,1)):.1f}%")
```
then:
```python
import os
import zipfile
from PIL import Image
from tqdm import tqdm
# Directory holding the deduplicated PNGs; the filter step below deletes
# rejected files from it in place and the ZIP step packs what remains.
INPUT_DIR = "/content/unique_skins"
# NOTE(review): this directory is created below but is not referenced by the
# filter or ZIP steps visible in this script — confirm whether it is needed.
OUTPUT_DIR = "/content/filtered_skins"
# Destination of the final archive.
ZIP_PATH = "/content/minecraft_skins_64x64.zip"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Step 1: Filter to 64x64 only
# Destructive: every .png in INPUT_DIR that cannot be opened or is not
# exactly 64x64 is deleted in place. Non-.png entries are left untouched.
print("Filtering to 64x64...")
all_skins = os.listdir(INPUT_DIR)
filtered = 0  # files kept
skipped = 0   # files deleted (wrong size or unreadable)
for filename in tqdm(all_skins):
    if not filename.endswith(".png"):
        continue
    path = os.path.join(INPUT_DIR, filename)

    # Keep the try body to the open/inspect step only, and use a context
    # manager so the file handle is closed on every path before the file
    # may be deleted.
    try:
        with Image.open(path) as img:
            size = img.size
    except Exception:
        size = None  # unreadable file: treated like a wrong-sized one

    if size == (64, 64):
        filtered += 1
    else:
        os.remove(path)  # delete non-64x64 and corrupt files
        skipped += 1
print(f"Kept: {filtered:,} | Skipped/removed: {skipped:,}")
# Step 2: Pack into ZIP
# Every .png still present in INPUT_DIR is stored at the archive root under
# its own file name, using DEFLATE at compression level 1.
print("\nPacking into ZIP...")
skin_files = [name for name in os.listdir(INPUT_DIR) if name.endswith(".png")]
archive = zipfile.ZipFile(ZIP_PATH, "w", zipfile.ZIP_DEFLATED, compresslevel=1)
with archive as zf:
    for filename in tqdm(skin_files):
        source_path = os.path.join(INPUT_DIR, filename)
        zf.write(source_path, arcname=filename)
print(f"\nDone! ZIP saved to {ZIP_PATH}")
zip_size_mb = os.path.getsize(ZIP_PATH) / 1024 / 1024
print(f"ZIP size: {zip_size_mb:.1f} MB")
```
提供机构:
MihaiPopa-1



