unlearning-cleanslate/fsid-curated-olmo-7b
收藏 · Hugging Face · 2026-04-28 更新 · 2026-05-03 收录
下载链接:
https://hf-mirror.com/datasets/unlearning-cleanslate/fsid-curated-olmo-7b
下载链接
链接失效反馈 · 官方服务:
资源简介:
---
# Dataset-card front matter.
# `dataset_info` lists, per config, the feature schema, the splits with their
# byte/example counts, and the total download/dataset sizes.
# `configs` maps every split of every config to a data-file glob pattern.
dataset_info:
- config_name: forget
  features:
  - name: request_id
    dtype: string
  - name: content_id
    dtype: string
  - name: content_title
    dtype: string
  - name: window_idx
    dtype: int64
  - name: prefix
    dtype: string
  - name: suffix
    dtype: string
  - name: memorized_fraction
    dtype: float64
  - name: rule_name
    dtype: string
  splits:
  - name: baseline
    num_bytes: 2681566
    num_examples: 8084
  - name: bm25_10B
    num_bytes: 1599187
    num_examples: 4839
  - name: bm25_6T
    num_bytes: 803167
    num_examples: 2432
  - name: igm_10B
    num_bytes: 1464743
    num_examples: 4442
  download_size: 1131722
  # Equals the sum of the four split num_bytes values above.
  dataset_size: 6548663
- config_name: forget_pool
  features:
  - name: content_id
    dtype: string
  - name: content_title
    dtype: string
  - name: content_creators
    list: string
  - name: content_year
    dtype: int64
  - name: lyrics
    dtype: string
  - name: memorized_fraction
    dtype: float64
  - name: max_p_z
    dtype: float64
  - name: num_windows
    dtype: int64
  - name: memorized_windows
    dtype: int64
  - name: source_dataset
    dtype: string
  - name: pool_bin
    dtype: int64
  splits:
  - name: train
    num_bytes: 95697
    num_examples: 38
  download_size: 44449
  dataset_size: 95697
- config_name: retain
  features:
  - name: text
    dtype: string
  - name: rule_name
    dtype: string
  splits:
  - name: baseline
    num_bytes: 1156576
    num_examples: 1646
  - name: bm25_10B
    num_bytes: 1153284
    num_examples: 1646
  - name: bm25_6T
    num_bytes: 1151638
    num_examples: 1646
  - name: igm_10B
    num_bytes: 1151638
    num_examples: 1646
  download_size: 2540876
  # Equals the sum of the four split num_bytes values above.
  dataset_size: 4613136
- config_name: retain_pool
  features:
  - name: text_length_chars
    dtype: int64
  - name: num_windows
    dtype: int64
  - name: memorized_windows
    dtype: int64
  - name: memorized_fraction
    dtype: float64
  - name: coverage
    dtype: float64
  - name: max_p_z
    dtype: float64
  - name: mean_p_z
    dtype: float64
  - name: median_p_z
    dtype: float64
  - name: min_p_z
    dtype: float64
  - name: std_p_z
    dtype: float64
  - name: best_window_idx
    dtype: int64
  - name: best_window_p_z
    dtype: float64
  - name: best_window_seed
    dtype: string
  - name: best_window_target
    dtype: string
  - name: best_window_start_char
    dtype: int64
  - name: best_window_end_char
    dtype: int64
  - name: eval_model
    dtype: string
  - name: window_size
    dtype: int64
  - name: stride
    dtype: int64
  - name: eval_threshold
    dtype: float64
  # Each row carries a list of per-window records with the fields below.
  - name: windows
    list:
    - name: end_char
      dtype: int64
    - name: idx
      dtype: int64
    - name: is_memorized
      dtype: bool
    - name: log_prob
      dtype: float64
    - name: num_target_tokens
      dtype: int64
    - name: p_z
      dtype: float64
    - name: seed
      dtype: string
    - name: start_char
      dtype: int64
    - name: target
      dtype: string
    - name: target_log_probs
      list: float64
    - name: target_ranks
      list: int64
  # NOTE(review): nesting was reconstructed from flattened text. The content_*
  # fields are assumed to be top-level features rather than members of
  # `windows` (the nested fields are alphabetically ordered and end at
  # target_ranks) -- confirm against the upstream card.
  - name: content_id
    dtype: string
  - name: content_title
    dtype: string
  - name: content_creators
    dtype: string
  - name: content_year
    dtype: int64
  splits:
  - name: train
    num_bytes: 2731653349
    num_examples: 4625
  download_size: 2739486671
  dataset_size: 2731653349
configs:
- config_name: forget
  data_files:
  - split: baseline
    path: forget/baseline-*
  - split: bm25_10B
    path: forget/bm25_10B-*
  - split: bm25_6T
    path: forget/bm25_6T-*
  - split: igm_10B
    path: forget/igm_10B-*
- config_name: forget_pool
  data_files:
  - split: train
    path: forget_pool/train-*
- config_name: retain
  data_files:
  - split: baseline
    path: retain/baseline-*
  - split: bm25_10B
    path: retain/bm25_10B-*
  - split: bm25_6T
    path: retain/bm25_6T-*
  - split: igm_10B
    path: retain/igm_10B-*
- config_name: retain_pool
  data_files:
  - split: train
    path: retain_pool/train-*
---
提供机构:
unlearning-cleanslate



