perlthoughts/gefilte-fish
收藏 | Hugging Face | 2023-12-20 更新 | 2024-03-04 收录
下载链接:
https://hf-mirror.com/datasets/perlthoughts/gefilte-fish
下载链接
链接失效反馈官方服务:
资源简介:
---
license: apache-2.0
---
code
```python
# Fallback system prompt, used when a training sample does not include one.
DEFAULT_SYSTEM_PROMPT = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request."
)
# Exclusion list: an item was not added to the dataset if its prompt or system
# prompt contained any of the entries below.
# NOTE: "can't answer" is listed twice; both occurrences are kept as-is.
# NOTE(review): " AI " is the only entry containing uppercase letters - if the
# matching is done against lowercased text it can never match; confirm.
BAD_WORDS = [
    # language names and translation-related terms
    "english", "translate", "russian", "chinese", "japanese", "spanish",
    "persian", "french", "german", "italian", "korean", "arabic", "hindi",
    "portuguese", "turkish", "vietnamese", "indonesian", "thai", "polish",
    "dutch", "greek", "czech", "romanian", "swedish", "danish", "finnish",
    "hungarian", "norwegian", "slovak", "slovenian", "lithuanian", "latvian",
    "estonian", "bulgarian", "serbian", "ukrainian", "belarusian", "croatian",
    "bosnian", "macedonian", "albanian", "icelandic", "irish", "welsh",
    "scottish", "latin", "esperanto", "hebrew", "yiddish", "afrikaans",
    "swahili", "zulu", "xhosa", "sotho", "sesotho", "somali", "hausa", "igbo",
    "yoruba", "malay", "tagalog", "hawaiian", "maori", "mongolian", "tamil",
    "telugu", "kannada", "gujarati", "marathi", "punjabi", "nepali",
    "sinhala", "khmer", "lao", "burmese", "tibetan", "georgian",
    "azerbaijani", "kurdish", "armenian", "kazakh", "uzbek", "tajik",
    "kirghiz", "turkmen", "tatar", "bashkir", "chechen", "chuvash",
    "ossetian", "moldavian", "moldovan",
    # AI / model self-references
    "language model", " AI ", "openai", "gpt", "gpt-2", "gpt-3", "gpt2",
    "gpt3", "gpt4", "gpt-4",
    # refusal, hedging and moralising phrases
    "illegal", "harmful", "cannot provide", "yourself or others",
    "harm to yourself", "cannot suggest", "morals", "ethical",
    "cannot answer", "can't answer", "don't know", "no answer",
    "no response", "i can't", "not enough information", "insufficient",
    "it is not possible", "not answerable", "unfortunately", "can't answer",
    "am not sure",
    # model-name fragments
    "davinci-0", "ada-0", "babbage-0", "curie-0",
]
# Total number of items; presumably the base that the per-dataset ratios are
# applied to - confirm against the build script.
TOTAL_ITEMS = 100_000
# Every source dataset used, with the share of the total drawn from each one
# (the ratios add up to 1.0).
# Fields of each entry:
#   "ratio"  - fraction of the total taken from this dataset
#   "set"    - presumably the name of the split to load; confirm
#   "system" / "prompt" / "output" - where each part of a sample comes from;
#       presumably column names in the source dataset, except where "system"
#       is DEFAULT_SYSTEM_PROMPT (a literal prompt) - confirm against the
#       build script.
DATASETS = {
    "migtissera/Synthia-v1.3": {
        "ratio": 0.2,
        "set": "train",
        "system": "system",
        "prompt": "instruction",
        "output": "response",
    },
    "meta-math/MetaMathQA": {
        "ratio": 0.1,
        "set": "train",
        "system": DEFAULT_SYSTEM_PROMPT,
        "prompt": "query",
        "output": "response",
    },
    "HuggingFaceH4/ultrafeedback_binarized": {
        "ratio": 0.3,
        "set": "train_sft",
        "system": DEFAULT_SYSTEM_PROMPT,
        "prompt": "prompt",
        # Stored as a plain string; presumably tells the build script to pull
        # the assistant reply out of the "chosen" column - confirm.
        "output": "get_assistant(chosen)",
    },
    "ehartford/dolphin": {
        "ratio": 0.3,
        "set": "train",
        "system": "instruction",
        "prompt": "input",
        "output": "output",
    },
    "Open-Orca/OpenOrca": {
        "ratio": 0.1,
        "set": "train",
        "system": "system_prompt",
        "prompt": "question",
        "output": "response",
    },
}
```
提供机构:
perlthoughts
原始信息汇总
数据集概述
许可证
- Apache-2.0
默认系统提示
DEFAULT_SYSTEM_PROMPT:"Below is an instruction that describes a task. Write a response that appropriately completes the request."
不良词汇列表
BAD_WORDS:包含多种语言名称、AI 相关词汇以及拒答或限制性措辞;提示或系统提示中出现其中任一词汇的样本不会被加入数据集。
数据集总项数
TOTAL_ITEMS:100000
数据集组成
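DATASETS:列出所用的各个源数据集、各自占总量的比例(ratio)、使用的数据拆分(set)以及字段映射(system / prompt / output)。migtissera/Synthia-v1.3:比例 0.2,拆分 train,系统提示取自 system,提示取自 instruction,输出取自 response。meta-math/MetaMathQA:比例 0.1,拆分 train,使用默认系统提示,提示取自 query,输出取自 response。HuggingFaceH4/ultrafeedback_binarized:比例 0.3,拆分 train_sft,使用默认系统提示,提示取自 prompt,输出为 get_assistant(chosen)(应为从 chosen 字段中提取助手回复)。ehartford/dolphin:比例 0.3,拆分 train,系统提示取自 instruction,提示取自 input,输出取自 output。Open-Orca/OpenOrca:比例 0.1,拆分 train,系统提示取自 system_prompt,提示取自 question,输出取自 response。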



