perlthoughts/big-brain-4k
收藏 | Hugging Face | 2023-12-20 更新 | 2024-03-04 收录
下载链接:
https://hf-mirror.com/datasets/perlthoughts/big-brain-4k
下载链接
链接失效反馈官方服务:
资源简介:
---
license: apache-2.0
---
code
```python
# Fallback system prompt, applied to training samples that do not carry
# a system prompt of their own.
DEFAULT_SYSTEM_PROMPT = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request."
)
# Blocklist: an item is skipped when any of these terms appears in its system
# prompt or its prompt. How the comparison is performed (substring vs. token,
# case handling) is not visible here.
# NOTE(review): " AI " is the only entry containing uppercase letters; if the
# filter lower-cases text before matching, that entry can never match -- confirm
# against the filtering code before changing it.
BAD_WORDS = [
    # language names and translation requests
    "english", "translate", "russian", "chinese", "japanese", "spanish", "persian", "french", "german", "italian", "korean",
    "arabic", "hindi", "portuguese", "turkish", "vietnamese", "indonesian", "thai", "polish", "dutch", "greek", "czech",
    "romanian", "swedish", "danish", "finnish", "hungarian", "norwegian", "slovak", "slovenian", "lithuanian", "latvian",
    "estonian", "bulgarian", "serbian", "ukrainian", "belarusian", "croatian", "bosnian", "macedonian", "albanian", "icelandic",
    "irish", "welsh", "scottish", "latin", "esperanto", "hebrew", "yiddish", "afrikaans", "swahili", "zulu", "xhosa", "sotho",
    "sesotho", "somali", "hausa", "igbo", "yoruba", "malay", "tagalog", "hawaiian", "maori", "mongolian", "tamil", "telugu",
    "kannada", "gujarati", "marathi", "punjabi", "nepali", "sinhala", "khmer", "lao", "burmese", "tibetan", "georgian",
    "azerbaijani", "kurdish", "armenian", "kazakh", "uzbek", "tajik", "kirghiz", "turkmen", "tatar", "bashkir", "chechen",
    "chuvash", "ossetian", "moldavian", "moldovan",
    # references to AI systems and model names
    "language model", " AI ", "openai", "gpt", "gpt-2", "gpt-3", "gpt2", "gpt3", "gpt4", "gpt-4",
    # safety / refusal wording
    "illegal", "harmful", "cannot provide", "yourself or others", "harm to yourself", "cannot suggest", "morals", "ethical",
    # non-answers (each phrase listed once; a repeated "can't answer" was dropped)
    "cannot answer", "can't answer", "don't know", "no answer", "no response", "i can't", "not enough information", "insufficient",
    "it is not possible", "not answerable", "unfortunately", "am not sure",
    # legacy model-name prefixes
    "davinci-0", "ada-0", "babbage-0", "curie-0",
]
# Allowlist of topic / format keywords checked against an item's system prompt
# or prompt.
# NOTE(review): the original note read "if any of these words are not in the
# system or prompt, the item will be skipped", which taken literally would
# require every keyword to be present. Presumably an item is skipped only when
# none of them appears -- confirm against the filtering code.
GOOD_WORDS = [
    # maths, logic and general problem solving
    "solve", "calculate", "math", "equation", "formula", "logic", "algebra", "geometry", "riddle", "puzzle", "proof", "theorem",
    "problem", "theory",
    # academic and technical subjects
    "finance", "economics", "chemistry", "biology", "physics", "science", "history", "geography",
    "philosophy", "psychology", "sociology", "computer", "programming", "technology", "engineering", "medicine", "health",
    "code", "program",
    # medical vocabulary (repeated "health" and "medicine" entries were dropped)
    "medical", "doctor", "nurse", "hospital", "disease", "bacteria", "symptom", "cancer",
    "diagnosis", "treatment", "procedure", "infection", "survival", "therapy", "psychological", "psychiatry",
    # summarisation, research and document-style prompts
    "summarize", "summarized", "find the", "result", "title", "author", "abstract", "conclusion", "research", "upon a time",
    "to whom it may", "subject:", "title:", "from:", "date:", "invoice", "recipe", "life pro tip", "tweet", "a story", "a poem",
    "short story", "article", "essay",
]
# Overall item count; presumably the total that the per-dataset ratios are
# applied to -- confirm against the sampling code.
TOTAL_ITEMS = 100_000
# Every source dataset in the mix, keyed by its repository id, together with
# the fraction of the total that it contributes.
# Fields of each entry:
#   ratio  - share of the total drawn from this dataset
#   set    - presumably the name of the split to read (verify against the loader)
#   system - either DEFAULT_SYSTEM_PROMPT itself or, presumably, the name of the
#            column holding a per-sample system prompt (verify against the loader)
#   prompt - presumably the column holding the instruction text
#   output - presumably the column holding the answer; "get_assistant(chosen)"
#            looks like an expression resolved by the loader -- TODO confirm
DATASETS = {
    "meta-math/MetaMathQA": {
        "ratio": 0.3,
        "set": "train",
        "system": DEFAULT_SYSTEM_PROMPT,
        "prompt": "query",
        "output": "response",
    },
    "allenai/ultrafeedback_binarized_cleaned": {
        "ratio": 0.3,
        "set": "train_sft",
        "system": DEFAULT_SYSTEM_PROMPT,
        "prompt": "prompt",
        "output": "get_assistant(chosen)",
    },
    "Open-Orca/OpenOrca": {
        "ratio": 0.4,
        "set": "train",
        "system": "system_prompt",
        "prompt": "question",
        "output": "response",
    },
}
# Character-length limit (4 KiB worth of characters). How it is enforced
# (dropping vs. truncating over-long items) is not shown here.
MAX_CHAR_LENGTH = 4 * 1024
```
提供机构:
perlthoughts
原始信息汇总
数据集概述
默认系统提示
- 默认系统提示: "Below is an instruction that describes a task. Write a response that appropriately completes the request."
过滤词
- 不良词汇: 包含多种语言名称、AI相关词汇以及拒答或负面表述;系统提示或输入中出现其中任一词汇的样本将被跳过。
- 良好词汇: 包含与数学、科学、编程、医学、写作等领域相关的词汇,用于筛选需要保留的样本。
数据集组成
- 总项目数: 100,000
- 数据集详情:
  - meta-math/MetaMathQA: 占比30%,使用"train"划分,系统提示为默认系统提示,输入字段为"query",输出字段为"response"。
  - allenai/ultrafeedback_binarized_cleaned: 占比30%,使用"train_sft"划分,系统提示为默认系统提示,输入字段为"prompt",输出字段为"get_assistant(chosen)"。
  - Open-Orca/OpenOrca: 占比40%,使用"train"划分,系统提示取自"system_prompt"字段,输入字段为"question",输出字段为"response"。
最大字符长度
- 最大字符长度: 4096



