下载链接：

https://modelscope.cn/datasets/EleutherAI/fake-mnist

下载链接

链接失效反馈

官方服务：

资源简介：

This is a dataset of "fake" MNIST images which were sampled from a high-entropy distribution whose mean and covariance matrix matches that of the original MNIST. It was generated with the following code:

```py
from datasets import ClassLabel, Dataset, DatasetDict, Features, Image, load_dataset
from functools import partial


def generator(split: str):
    from datasets import Dataset
    from concept_erasure import assert_type, groupby, optimal_linear_shrinkage
    from concept_erasure.optimal_transport import psd_sqrt
    from PIL import Image as PilImage
    from torch import nn, optim, Tensor
    import torch

    def koleo(x: Tensor) -> Tensor:
        """Kozachenko-Leonenko estimator of entropy."""
        return torch.cdist(x, x).kthvalue(2).values.log().mean()

    def hypercube_sample(
        n: int,
        mean: Tensor,
        cov: Tensor,
        *,
        koleo_weight: float = 1e-3,
        max_iter: int = 100,
        seed: int = 0,
    ):
        """Generate `n` samples from a distribution on [0, 1]^d with the given moments."""

        d = mean.shape[-1]
        assert d == cov.shape[-1] == cov.shape[-2], "Dimension mismatch"
        assert n > 1, "Need at least two samples to compute covariance"

        eps = torch.finfo(mean.dtype).eps
        rng = torch.Generator(device=mean.device).manual_seed(seed)

        # Initialize with max-ent samples matching `mean` and `cov` but without hypercube
        # constraint. We do so in a way that is robust to singular `cov`
        z = mean.new_empty([n, d]).normal_(generator=rng)
        x = torch.clamp(z @ psd_sqrt(cov) + mean, eps, 1 - eps)

        # Reparametrize to enforce hypercube constraint
        z = nn.Parameter(x.logit())
        opt = optim.LBFGS([z], line_search_fn="strong_wolfe", max_iter=max_iter)

        def closure():
            opt.zero_grad()

            x = z.sigmoid()
            loss = torch.norm(x.mean(0) - mean) + torch.norm(x.T.cov() - cov)
            loss -= koleo_weight * koleo(x)
            loss.backward()
            return float(loss)

        opt.step(closure)
        return z.sigmoid().detach()

    ds = assert_type(Dataset, load_dataset("mnist", split=split))

    with ds.formatted_as("torch"):
        X = assert_type(Tensor, ds["image"]).div(255).cuda()
        Y = assert_type(Tensor, ds["label"]).cuda()

    # Iterate over the classes
    for y, x in groupby(X, Y):
        mean = x.flatten(1).mean(0)
        cov = optimal_linear_shrinkage(x.flatten(1).mT.cov(), len(x))

        for fake_x in hypercube_sample(len(x), mean, cov).reshape_as(x).mul(255).cpu():
            yield {"image": PilImage.fromarray(fake_x.numpy()).convert("L"), "label": y}


features = Features({
    "image": Image(),
    "label": ClassLabel(num_classes=10),
})
fake_train = Dataset.from_generator(partial(generator, "train"), features)
fake_test = Dataset.from_generator(partial(generator, "test"), features)

fake = DatasetDict({"train": fake_train, "test": fake_test})
fake.push_to_hub("EleutherAI/fake-mnist")
```

本数据集为"伪"MNIST 图像数据集，其采样自均值与协方差矩阵与原始 MNIST 数据集一致的高熵分布。该数据集通过如下代码生成：

```py
from datasets import ClassLabel, Dataset, DatasetDict, Features, Image, load_dataset
from functools import partial


def generator(split: str):
    from datasets import Dataset
    from concept_erasure import assert_type, groupby, optimal_linear_shrinkage
    from concept_erasure.optimal_transport import psd_sqrt
    from PIL import Image as PilImage
    from torch import nn, optim, Tensor
    import torch

    def koleo(x: Tensor) -> Tensor:
        """科扎琴科-列昂尼科（Kozachenko-Leonenko）熵估计器。"""
        return torch.cdist(x, x).kthvalue(2).values.log().mean()

    def hypercube_sample(
        n: int,
        mean: Tensor,
        cov: Tensor,
        *,
        koleo_weight: float = 1e-3,
        max_iter: int = 100,
        seed: int = 0,
    ):
        """从给定矩的[0,1]^d空间分布中生成`n`个样本。"""

        d = mean.shape[-1]
        assert d == cov.shape[-1] == cov.shape[-2], "维度不匹配"
        assert n > 1, "至少需要两个样本以计算协方差"

        eps = torch.finfo(mean.dtype).eps
        rng = torch.Generator(device=mean.device).manual_seed(seed)

        # 从匹配给定均值与协方差但不满足超立方体约束的最大熵样本初始化，该方法对奇异协方差矩阵具有鲁棒性
        z = mean.new_empty([n, d]).normal_(generator=rng)
        x = torch.clamp(z @ psd_sqrt(cov) + mean, eps, 1 - eps)

        # 重参数化以满足超立方体约束
        z = nn.Parameter(x.logit())
        opt = optim.LBFGS([z], line_search_fn="strong_wolfe", max_iter=max_iter)

        def closure():
            opt.zero_grad()

            x = z.sigmoid()
            loss = torch.norm(x.mean(0) - mean) + torch.norm(x.T.cov() - cov)
            loss -= koleo_weight * koleo(x)
            loss.backward()
            return float(loss)

        opt.step(closure)
        return z.sigmoid().detach()

    ds = assert_type(Dataset, load_dataset("mnist", split=split))

    with ds.formatted_as("torch"):
        X = assert_type(Tensor, ds["image"]).div(255).cuda()
        Y = assert_type(Tensor, ds["label"]).cuda()

    # 按类别迭代处理
    for y, x in groupby(X, Y):
        mean = x.flatten(1).mean(0)
        cov = optimal_linear_shrinkage(x.flatten(1).mT.cov(), len(x))

        for fake_x in hypercube_sample(len(x), mean, cov).reshape_as(x).mul(255).cpu():
            yield {"image": PilImage.fromarray(fake_x.numpy()).convert("L"), "label": y}


features = Features({
    "image": Image(),
    "label": ClassLabel(num_classes=10),
})
fake_train = Dataset.from_generator(partial(generator, "train"), features)
fake_test = Dataset.from_generator(partial(generator, "test"), features)

fake = DatasetDict({"train": fake_train, "test": fake_test})
fake.push_to_hub("EleutherAI/fake-mnist")
```

应用场景：