pharaouk/SkunkData-Corpus-Clusters
收藏Hugging Face2023-09-15 更新2024-03-04 收录
下载链接:
https://hf-mirror.com/datasets/pharaouk/SkunkData-Corpus-Clusters
下载链接
链接失效反馈官方服务:
资源简介:
---
configs:
- config_name: default
data_files:
- split: orca_0
path: data/orca_0-*
- split: instruct_0
path: data/instruct_0-*
- split: orca_1
path: data/orca_1-*
- split: instruct_1
path: data/instruct_1-*
- split: orca_2
path: data/orca_2-*
- split: instruct_2
path: data/instruct_2-*
- split: orca_3
path: data/orca_3-*
- split: instruct_3
path: data/instruct_3-*
- split: orca_4
path: data/orca_4-*
- split: instruct_4
path: data/instruct_4-*
- split: orca_5
path: data/orca_5-*
- split: instruct_5
path: data/instruct_5-*
- split: orca_6
path: data/orca_6-*
- split: instruct_6
path: data/instruct_6-*
- split: orca_7
path: data/orca_7-*
- split: instruct_7
path: data/instruct_7-*
- split: orca_8
path: data/orca_8-*
- split: instruct_8
path: data/instruct_8-*
- split: orca_9
path: data/orca_9-*
- split: instruct_9
path: data/instruct_9-*
- split: orca_10
path: data/orca_10-*
- split: instruct_10
path: data/instruct_10-*
- split: orca_11
path: data/orca_11-*
- split: instruct_11
path: data/instruct_11-*
- split: orca_12
path: data/orca_12-*
- split: instruct_12
path: data/instruct_12-*
- split: orca_13
path: data/orca_13-*
- split: instruct_13
path: data/instruct_13-*
- split: orca_14
path: data/orca_14-*
- split: instruct_14
path: data/instruct_14-*
- split: orca_15
path: data/orca_15-*
- split: instruct_15
path: data/instruct_15-*
- split: orca_16
path: data/orca_16-*
- split: instruct_16
path: data/instruct_16-*
- split: orca_17
path: data/orca_17-*
- split: instruct_17
path: data/instruct_17-*
- split: orca_18
path: data/orca_18-*
- split: instruct_18
path: data/instruct_18-*
- split: orca_19
path: data/orca_19-*
- split: instruct_19
path: data/instruct_19-*
- split: orca_20
path: data/orca_20-*
- split: instruct_20
path: data/instruct_20-*
- split: orca_21
path: data/orca_21-*
- split: instruct_21
path: data/instruct_21-*
- split: orca_22
path: data/orca_22-*
- split: instruct_22
path: data/instruct_22-*
- split: orca_23
path: data/orca_23-*
- split: instruct_23
path: data/instruct_23-*
- split: orca_24
path: data/orca_24-*
- split: instruct_24
path: data/instruct_24-*
- split: orca_25
path: data/orca_25-*
- split: instruct_25
path: data/instruct_25-*
- split: orca_26
path: data/orca_26-*
- split: instruct_26
path: data/instruct_26-*
- split: orca_27
path: data/orca_27-*
- split: instruct_27
path: data/instruct_27-*
- split: orca_28
path: data/orca_28-*
- split: instruct_28
path: data/instruct_28-*
- split: orca_29
path: data/orca_29-*
- split: instruct_29
path: data/instruct_29-*
- split: orca_30
path: data/orca_30-*
- split: instruct_30
path: data/instruct_30-*
- split: orca_31
path: data/orca_31-*
- split: instruct_31
path: data/instruct_31-*
dataset_info:
features:
- name: message
dtype: string
- name: message_type
dtype: string
- name: message_id
dtype: int64
- name: conversation_id
dtype: int64
- name: dataset_id
dtype: string
- name: unique_conversation_id
dtype: string
- name: cluster
dtype: float64
- name: __index_level_0__
dtype: int64
splits:
- name: orca_0
num_bytes: 17849715
num_examples: 18401
- name: instruct_0
num_bytes: 70074569
num_examples: 81024
- name: orca_1
num_bytes: 23680133
num_examples: 28584
- name: instruct_1
num_bytes: 82931087
num_examples: 96749
- name: orca_2
num_bytes: 19980410
num_examples: 17412
- name: instruct_2
num_bytes: 154000003
num_examples: 124814
- name: orca_3
num_bytes: 17101778
num_examples: 32038
- name: instruct_3
num_bytes: 49883928
num_examples: 63327
- name: orca_4
num_bytes: 31656753
num_examples: 34675
- name: instruct_4
num_bytes: 127695479
num_examples: 126005
- name: orca_5
num_bytes: 16269511
num_examples: 14092
- name: instruct_5
num_bytes: 61398228
num_examples: 59076
- name: orca_6
num_bytes: 1342860
num_examples: 2388
- name: instruct_6
num_bytes: 48450814
num_examples: 66011
- name: orca_7
num_bytes: 44849080
num_examples: 36172
- name: instruct_7
num_bytes: 65892068
num_examples: 59876
- name: orca_8
num_bytes: 19352268
num_examples: 18871
- name: instruct_8
num_bytes: 227627947
num_examples: 170841
- name: orca_9
num_bytes: 14700372
num_examples: 15315
- name: instruct_9
num_bytes: 64004683
num_examples: 60637
- name: orca_10
num_bytes: 508915
num_examples: 1446
- name: instruct_10
num_bytes: 24081225
num_examples: 48031
- name: orca_11
num_bytes: 19443068
num_examples: 19745
- name: instruct_11
num_bytes: 82438320
num_examples: 80868
- name: orca_12
num_bytes: 4848059
num_examples: 7172
- name: instruct_12
num_bytes: 166293672
num_examples: 182113
- name: orca_13
num_bytes: 10599648
num_examples: 19167
- name: instruct_13
num_bytes: 84060226
num_examples: 152834
- name: orca_14
num_bytes: 15987021
num_examples: 24048
- name: instruct_14
num_bytes: 59454799
num_examples: 91972
- name: orca_15
num_bytes: 23903599
num_examples: 24410
- name: instruct_15
num_bytes: 85555445
num_examples: 84953
- name: orca_16
num_bytes: 23154299
num_examples: 19289
- name: instruct_16
num_bytes: 101140401
num_examples: 90731
- name: orca_17
num_bytes: 2152082
num_examples: 3809
- name: instruct_17
num_bytes: 66472234
num_examples: 80386
- name: orca_18
num_bytes: 83273007
num_examples: 45544
- name: instruct_18
num_bytes: 110961860
num_examples: 80604
- name: orca_19
num_bytes: 1386401
num_examples: 1644
- name: instruct_19
num_bytes: 37424277
num_examples: 42630
- name: orca_20
num_bytes: 15212013
num_examples: 14602
- name: instruct_20
num_bytes: 94216681
num_examples: 77830
- name: orca_21
num_bytes: 3440922
num_examples: 4174
- name: instruct_21
num_bytes: 124095838
num_examples: 87012
- name: orca_22
num_bytes: 11468080
num_examples: 14191
- name: instruct_22
num_bytes: 63633991
num_examples: 78980
- name: orca_23
num_bytes: 3591049
num_examples: 3778
- name: instruct_23
num_bytes: 95699355
num_examples: 69680
- name: orca_24
num_bytes: 1309953
num_examples: 2395
- name: instruct_24
num_bytes: 82548064
num_examples: 92642
- name: orca_25
num_bytes: 20598114
num_examples: 18715
- name: instruct_25
num_bytes: 132539502
num_examples: 99843
- name: orca_26
num_bytes: 31638864
num_examples: 65463
- name: instruct_26
num_bytes: 52624322
num_examples: 81968
- name: orca_27
num_bytes: 3056079
num_examples: 5939
- name: instruct_27
num_bytes: 29071432
num_examples: 55864
- name: orca_28
num_bytes: 12158143
num_examples: 16039
- name: instruct_28
num_bytes: 67326019
num_examples: 84243
- name: orca_29
num_bytes: 33228880
num_examples: 65846
- name: instruct_29
num_bytes: 16788126
num_examples: 21536
- name: orca_30
num_bytes: 1580412
num_examples: 1991
- name: instruct_30
num_bytes: 15819978
num_examples: 29766
- name: orca_31
num_bytes: 6719191
num_examples: 11269
- name: instruct_31
num_bytes: 29009522
num_examples: 47163
download_size: 1412051638
dataset_size: 3109254774
---
# Dataset Card for "SkunkData-Corpus-Clusters"
[More Information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
配置:
- 配置名称:default
数据文件:
- 拆分:orca_0
路径:data/orca_0-*
- 拆分:instruct_0
路径:data/instruct_0-*
- 拆分:orca_1
路径:data/orca_1-*
- 拆分:instruct_1
路径:data/instruct_1-*
- 拆分:orca_2
路径:data/orca_2-*
- 拆分:instruct_2
路径:data/instruct_2-*
- 拆分:orca_3
路径:data/orca_3-*
- 拆分:instruct_3
路径:data/instruct_3-*
- 拆分:orca_4
路径:data/orca_4-*
- 拆分:instruct_4
路径:data/instruct_4-*
- 拆分:orca_5
路径:data/orca_5-*
- 拆分:instruct_5
路径:data/instruct_5-*
- 拆分:orca_6
路径:data/orca_6-*
- 拆分:instruct_6
路径:data/instruct_6-*
- 拆分:orca_7
路径:data/orca_7-*
- 拆分:instruct_7
路径:data/instruct_7-*
- 拆分:orca_8
路径:data/orca_8-*
- 拆分:instruct_8
路径:data/instruct_8-*
- 拆分:orca_9
路径:data/orca_9-*
- 拆分:instruct_9
路径:data/instruct_9-*
- 拆分:orca_10
路径:data/orca_10-*
- 拆分:instruct_10
路径:data/instruct_10-*
- 拆分:orca_11
路径:data/orca_11-*
- 拆分:instruct_11
路径:data/instruct_11-*
- 拆分:orca_12
路径:data/orca_12-*
- 拆分:instruct_12
路径:data/instruct_12-*
- 拆分:orca_13
路径:data/orca_13-*
- 拆分:instruct_13
路径:data/instruct_13-*
- 拆分:orca_14
路径:data/orca_14-*
- 拆分:instruct_14
路径:data/instruct_14-*
- 拆分:orca_15
路径:data/orca_15-*
- 拆分:instruct_15
路径:data/instruct_15-*
- 拆分:orca_16
路径:data/orca_16-*
- 拆分:instruct_16
路径:data/instruct_16-*
- 拆分:orca_17
路径:data/orca_17-*
- 拆分:instruct_17
路径:data/instruct_17-*
- 拆分:orca_18
路径:data/orca_18-*
- 拆分:instruct_18
路径:data/instruct_18-*
- 拆分:orca_19
路径:data/orca_19-*
- 拆分:instruct_19
路径:data/instruct_19-*
- 拆分:orca_20
路径:data/orca_20-*
- 拆分:instruct_20
路径:data/instruct_20-*
- 拆分:orca_21
路径:data/orca_21-*
- 拆分:instruct_21
路径:data/instruct_21-*
- 拆分:orca_22
路径:data/orca_22-*
- 拆分:instruct_22
路径:data/instruct_22-*
- 拆分:orca_23
路径:data/orca_23-*
- 拆分:instruct_23
路径:data/instruct_23-*
- 拆分:orca_24
路径:data/orca_24-*
- 拆分:instruct_24
路径:data/instruct_24-*
- 拆分:orca_25
路径:data/orca_25-*
- 拆分:instruct_25
路径:data/instruct_25-*
- 拆分:orca_26
路径:data/orca_26-*
- 拆分:instruct_26
路径:data/instruct_26-*
- 拆分:orca_27
路径:data/orca_27-*
- 拆分:instruct_27
路径:data/instruct_27-*
- 拆分:orca_28
路径:data/orca_28-*
- 拆分:instruct_28
路径:data/instruct_28-*
- 拆分:orca_29
路径:data/orca_29-*
- 拆分:instruct_29
路径:data/instruct_29-*
- 拆分:orca_30
路径:data/orca_30-*
- 拆分:instruct_30
路径:data/instruct_30-*
- 拆分:orca_31
路径:data/orca_31-*
- 拆分:instruct_31
路径:data/instruct_31-*
数据集信息:
特征:
- 名称:message
数据类型:string
- 名称:message_type
数据类型:string
- 名称:message_id
数据类型:int64
- 名称:conversation_id
数据类型:int64
- 名称:dataset_id
数据类型:string
- 名称:unique_conversation_id
数据类型:string
- 名称:cluster
数据类型:float64
- 名称:__index_level_0__
数据类型:int64
拆分:
- 名称:orca_0
字节数:17849715
样本数:18401
- 名称:instruct_0
字节数:70074569
样本数:81024
- 名称:orca_1
字节数:23680133
样本数:28584
- 名称:instruct_1
字节数:82931087
样本数:96749
- 名称:orca_2
字节数:19980410
样本数:17412
- 名称:instruct_2
字节数:154000003
样本数:124814
- 名称:orca_3
字节数:17101778
样本数:32038
- 名称:instruct_3
字节数:49883928
样本数:63327
- 名称:orca_4
字节数:31656753
样本数:34675
- 名称:instruct_4
字节数:127695479
样本数:126005
- 名称:orca_5
字节数:16269511
样本数:14092
- 名称:instruct_5
字节数:61398228
样本数:59076
- 名称:orca_6
字节数:1342860
样本数:2388
- 名称:instruct_6
字节数:48450814
样本数:66011
- 名称:orca_7
字节数:44849080
样本数:36172
- 名称:instruct_7
字节数:65892068
样本数:59876
- 名称:orca_8
字节数:19352268
样本数:18871
- 名称:instruct_8
字节数:227627947
样本数:170841
- 名称:orca_9
字节数:14700372
样本数:15315
- 名称:instruct_9
字节数:64004683
样本数:60637
- 名称:orca_10
字节数:508915
样本数:1446
- 名称:instruct_10
字节数:24081225
样本数:48031
- 名称:orca_11
字节数:19443068
样本数:19745
- 名称:instruct_11
字节数:82438320
样本数:80868
- 名称:orca_12
字节数:4848059
样本数:7172
- 名称:instruct_12
字节数:166293672
样本数:182113
- 名称:orca_13
字节数:10599648
样本数:19167
- 名称:instruct_13
字节数:84060226
样本数:152834
- 名称:orca_14
字节数:15987021
样本数:24048
- 名称:instruct_14
字节数:59454799
样本数:91972
- 名称:orca_15
字节数:23903599
样本数:24410
- 名称:instruct_15
字节数:85555445
样本数:84953
- 名称:orca_16
字节数:23154299
样本数:19289
- 名称:instruct_16
字节数:101140401
样本数:90731
- 名称:orca_17
字节数:2152082
样本数:3809
- 名称:instruct_17
字节数:66472234
样本数:80386
- 名称:orca_18
字节数:83273007
样本数:45544
- 名称:instruct_18
字节数:110961860
样本数:80604
- 名称:orca_19
字节数:1386401
样本数:1644
- 名称:instruct_19
字节数:37424277
样本数:42630
- 名称:orca_20
字节数:15212013
样本数:14602
- 名称:instruct_20
字节数:94216681
样本数:77830
- 名称:orca_21
字节数:3440922
样本数:4174
- 名称:instruct_21
字节数:124095838
样本数:87012
- 名称:orca_22
字节数:11468080
样本数:14191
- 名称:instruct_22
字节数:63633991
样本数:78980
- 名称:orca_23
字节数:3591049
样本数:3778
- 名称:instruct_23
字节数:95699355
样本数:69680
- 名称:orca_24
字节数:1309953
样本数:2395
- 名称:instruct_24
字节数:82548064
样本数:92642
- 名称:orca_25
字节数:20598114
样本数:18715
- 名称:instruct_25
字节数:132539502
样本数:99843
- 名称:orca_26
字节数:31638864
样本数:65463
- 名称:instruct_26
字节数:52624322
样本数:81968
- 名称:orca_27
字节数:3056079
样本数:5939
- 名称:instruct_27
字节数:29071432
样本数:55864
- 名称:orca_28
字节数:12158143
样本数:16039
- 名称:instruct_28
字节数:67326019
样本数:84243
- 名称:orca_29
字节数:33228880
样本数:65846
- 名称:instruct_29
字节数:16788126
样本数:21536
- 名称:orca_30
字节数:1580412
样本数:1991
- 名称:instruct_30
字节数:15819978
样本数:29766
- 名称:orca_31
字节数:6719191
样本数:11269
- 名称:instruct_31
字节数:29009522
样本数:47163
下载大小:1412051638
数据集大小:3109254774
# SkunkData-Corpus-Clusters数据集卡片
[需补充更多信息](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
提供机构:
pharaouk
原始信息汇总
数据集概述
数据集配置
- 配置名称: default
- 数据文件路径:
orca_0: data/orca_0-*; instruct_0: data/instruct_0-*
orca_1: data/orca_1-*; instruct_1: data/instruct_1-*
orca_2: data/orca_2-*; instruct_2: data/instruct_2-*
orca_3: data/orca_3-*; instruct_3: data/instruct_3-*
orca_4: data/orca_4-*; instruct_4: data/instruct_4-*
orca_5: data/orca_5-*; instruct_5: data/instruct_5-*
orca_6: data/orca_6-*; instruct_6: data/instruct_6-*
orca_7: data/orca_7-*; instruct_7: data/instruct_7-*
orca_8: data/orca_8-*; instruct_8: data/instruct_8-*
orca_9: data/orca_9-*; instruct_9: data/instruct_9-*
orca_10: data/orca_10-*; instruct_10: data/instruct_10-*
orca_11: data/orca_11-*; instruct_11: data/instruct_11-*
orca_12: data/orca_12-*; instruct_12: data/instruct_12-*
orca_13: data/orca_13-*; instruct_13: data/instruct_13-*
orca_14: data/orca_14-*; instruct_14: data/instruct_14-*
orca_15: data/orca_15-*; instruct_15: data/instruct_15-*
orca_16: data/orca_16-*; instruct_16: data/instruct_16-*
orca_17: data/orca_17-*; instruct_17: data/instruct_17-*
orca_18: data/orca_18-*; instruct_18: data/instruct_18-*
orca_19: data/orca_19-*; instruct_19: data/instruct_19-*
orca_20: data/orca_20-*; instruct_20: data/instruct_20-*
orca_21: data/orca_21-*; instruct_21: data/instruct_21-*
orca_22: data/orca_22-*; instruct_22: data/instruct_22-*
orca_23: data/orca_23-*; instruct_23: data/instruct_23-*
orca_24: data/orca_24-*; instruct_24: data/instruct_24-*
orca_25: data/orca_25-*; instruct_25: data/instruct_25-*
orca_26: data/orca_26-*; instruct_26: data/instruct_26-*
orca_27: data/orca_27-*; instruct_27: data/instruct_27-*
orca_28: data/orca_28-*; instruct_28: data/instruct_28-*
orca_29: data/orca_29-*; instruct_29: data/instruct_29-*
orca_30: data/orca_30-*; instruct_30: data/instruct_30-*
orca_31: data/orca_31-*; instruct_31: data/instruct_31-*
数据集信息
-
特征:
message: string; message_type: string; message_id: int64; conversation_id: int64; dataset_id: string; unique_conversation_id: string; cluster: float64; __index_level_0__: int64
-
分割:
orca_0: num_bytes: 17849715, num_examples: 18401
instruct_0: num_bytes: 70074569, num_examples: 81024
orca_1: num_bytes: 23680133, num_examples: 28584
instruct_1: num_bytes: 82931087, num_examples: 96749
orca_2: num_bytes: 19980410, num_examples: 17412
instruct_2: num_bytes: 154000003, num_examples: 124814
orca_3: num_bytes: 17101778, num_examples: 32038
instruct_3: num_bytes: 49883928, num_examples: 63327
orca_4: num_bytes: 31656753, num_examples: 34675
instruct_4: num_bytes: 127695479, num_examples: 126005
orca_5: num_bytes: 16269511, num_examples: 14092
instruct_5: num_bytes: 61398228, num_examples: 59076
orca_6: num_bytes: 1342860, num_examples: 2388
instruct_6: num_bytes: 48450814, num_examples: 66011
orca_7: num_bytes: 44849080, num_examples: 36172
instruct_7: num_bytes: 65892068, num_examples: 59876
orca_8: num_bytes: 19352268, num_examples: 18871
instruct_8: num_bytes: 227627947, num_examples: 170841
orca_9: num_bytes: 14700372, num_examples: 15315
instruct_9: num_bytes: 64004683, num_examples: 60637
orca_10: num_bytes: 508915, num_examples: 1446
instruct_10: num_bytes: 24081225, num_examples: 48031
orca_11: num_bytes: 19443068, num_examples: 19745
instruct_11: num_bytes: 82438320, num_examples: 80868
orca_12: num_bytes: 4848059, num_examples: 7172
instruct_12: num_bytes: 166293672, num_examples: 182113
orca_13: num_bytes: 10599648, num_examples: 19167
instruct_13: num_bytes: 84060226, num_examples: 152834
orca_14: num_bytes: 15987021, num_examples: 24048
instruct_14: num_bytes: 59454799, num_examples: 91972
orca_15: num_bytes: 23903599, num_examples: 24410
instruct_15: num_bytes: 85555445, num_examples: 84953
orca_16: num_bytes: 23154299, num_examples: 19289
instruct_16: num_bytes: 101140401, num_examples: 90731
orca_17: num_bytes: 2152082, num_examples: 3809
instruct_17: num_bytes: 66472234, num_examples: 80386
orca_18: num_bytes: 83273007, num_examples: 45544
instruct_18: num_bytes: 110961860, num_examples: 80604
orca_19: num_bytes: 1386401, num_examples: 1644
instruct_19: num_bytes: 37424277, num_examples: 42630
orca_20: num_bytes: 15212013, num_examples: 14602
instruct_20: num_bytes: 94216681, num_examples: 77830
orca_21: num_bytes: 3440922, num_examples: 4174
instruct_21: num_bytes: 124095838, num_examples: 87012
orca_22: num_bytes: 11468080, num_examples: 14191
instruct_22: num_bytes: 63633991, num_examples: 78980
orca_23: num_bytes: 3591049, num_examples: 3778
instruct_23: num_bytes: 95699355, num_examples: 69680
orca_24: num_bytes: 1309953, num_examples: 2395
instruct_24: num_bytes: 82548064, num_examples: 92642
orca_25: num_bytes: 20598114, num_examples: 18715
instruct_25: num_bytes: 132539502, num_examples: 99843
orca_26: num_bytes: 31638864, num_examples: 65463
instruct_26: num_bytes: 52624322, num_examples: 81968
orca_27: num_bytes: 3056079, num_examples: 5939
instruct_27: num_bytes: 29071432, num_examples: 55864
orca_28: num_bytes: 12158143, num_examples: 16039
instruct_28: num_bytes: 67326019, num_examples: 84243
orca_29: num_bytes: 33228880, num_examples: 65846
instruct_29: num_bytes: 16788126, num_examples: 21536
orca_30: num_bytes: 1580412, num_examples: 1991
instruct_30: num_bytes: 15819978, num_examples: 29766
orca_31: num_bytes: 6719191, num_examples: 11269
instruct_31: num_bytes: 29009522, num_examples: 47163
-
数据集大小:
download_size: 1412051638; dataset_size: 3109254774



