iNeil77/the-stack-dedup-filtered
收藏Hugging Face2024-09-05 更新2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/iNeil77/the-stack-dedup-filtered
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: arduino
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 651231434
num_examples: 137883
download_size: 292688119
dataset_size: 651231434
- config_name: batchfile
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 318712023
num_examples: 229135
download_size: 165483150
dataset_size: 318712023
- config_name: c
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: float64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: float64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: float64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 51146944061
num_examples: 8152314
download_size: 19753146402
dataset_size: 51146944061
- config_name: c-sharp
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: float64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: float64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: float64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 38988569615
num_examples: 9811753
download_size: 14190088400
dataset_size: 38988569615
- config_name: clojure
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 346730986
num_examples: 87472
download_size: 145209682
dataset_size: 346730986
- config_name: cmake
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 442854338
num_examples: 174636
download_size: 186216640
dataset_size: 442854338
- config_name: coffeescript
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 621721291
num_examples: 201828
download_size: 275100437
dataset_size: 621721291
- config_name: common-lisp
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 702736669
num_examples: 74124
download_size: 229167353
dataset_size: 702736669
- config_name: cpp
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: float64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: float64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: float64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 42856646485
num_examples: 5917320
download_size: 15819940913
dataset_size: 42856646485
- config_name: crystal
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 214248579
num_examples: 74891
download_size: 85420950
dataset_size: 214248579
- config_name: css
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 9298490680
num_examples: 2559141
download_size: 3379794560
dataset_size: 9298490680
- config_name: cuda
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 489087016
num_examples: 54108
download_size: 169218252
dataset_size: 489087016
- config_name: dart
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 3538079989
num_examples: 877410
download_size: 1326896131
dataset_size: 3538079989
- config_name: elixir
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: float64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: float64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: float64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 532989366
num_examples: 173746
download_size: 222392772
dataset_size: 532989366
- config_name: emacs-lisp
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 388817213
num_examples: 50104
download_size: 159071578
dataset_size: 388817213
- config_name: erlang
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 665329691
num_examples: 86001
download_size: 232762543
dataset_size: 665329691
- config_name: f-sharp
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 479576256
num_examples: 100042
download_size: 183307047
dataset_size: 479576256
- config_name: fortran
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 1646069603
num_examples: 140215
download_size: 568253668
dataset_size: 1646069603
- config_name: glsl
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 136728450
num_examples: 60125
download_size: 64879573
dataset_size: 136728450
- config_name: go
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 21130443168
num_examples: 4508683
download_size: 8277562386
dataset_size: 21130443168
- config_name: groovy
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 912808847
num_examples: 237452
download_size: 353585089
dataset_size: 912808847
- config_name: haml
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 156076234
num_examples: 104007
download_size: 75459381
dataset_size: 156076234
- config_name: haskell
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 2203331919
num_examples: 516563
download_size: 961995722
dataset_size: 2203331919
- config_name: html
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 20508625876
num_examples: 5097859
download_size: 8132523428
dataset_size: 20508625876
- config_name: html_django
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 91338537
num_examples: 41568
download_size: 40463434
dataset_size: 91338537
- config_name: html_erb
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 802279383
num_examples: 469038
download_size: 395181246
dataset_size: 802279383
- config_name: html_php
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 146948751
num_examples: 54676
download_size: 59597370
dataset_size: 146948751
- config_name: java
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 86849509573
num_examples: 19298266
download_size: 33463900644
dataset_size: 86849509573
- config_name: javascript
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 57287198038
num_examples: 18046757
download_size: 24850402926
dataset_size: 57287198038
- config_name: json
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 56090161856
num_examples: 7083359
download_size: 16308194296
dataset_size: 56090161856
- config_name: julia
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 1189419932
num_examples: 275116
download_size: 510947191
dataset_size: 1189419932
- config_name: kotlin
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 6746535622
num_examples: 2132571
download_size: 2841440543
dataset_size: 6746535622
- config_name: llvm
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 342176010
num_examples: 48545
download_size: 82272066
dataset_size: 342176010
- config_name: lua
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 2488019827
num_examples: 512331
download_size: 997169276
dataset_size: 2488019827
- config_name: markdown
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 27390927092
num_examples: 12634045
download_size: 15382946627
dataset_size: 27390927092
- config_name: objective-cpp
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 492610705
num_examples: 58428
download_size: 194504456
dataset_size: 492610705
- config_name: ocaml
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 843316635
num_examples: 116413
download_size: 315600762
dataset_size: 843316635
- config_name: pascal
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 953968201
num_examples: 86216
download_size: 315318520
dataset_size: 953968201
- config_name: perl
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 708174738
num_examples: 120740
download_size: 296558006
dataset_size: 708174738
- config_name: php
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 42690727436
num_examples: 13012864
download_size: 17080289867
dataset_size: 42690727436
- config_name: powershell
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 794963039
num_examples: 226675
download_size: 341300365
dataset_size: 794963039
- config_name: python
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 59551242060
num_examples: 12277991
download_size: 24648727288
dataset_size: 59551242060
- config_name: ruby
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 7673794665
num_examples: 3154108
download_size: 3490052262
dataset_size: 7673794665
- config_name: rust
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 8306276563
num_examples: 1309849
download_size: 2941572461
dataset_size: 8306276563
- config_name: sass
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 194252456
num_examples: 97251
download_size: 84681982
dataset_size: 194252456
- config_name: scala
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 4857796144
num_examples: 1285280
download_size: 1972838999
dataset_size: 4857796144
- config_name: scss
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 4381363366
num_examples: 1976732
download_size: 1890473646
dataset_size: 4381363366
- config_name: shell
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 3594070843
num_examples: 2059118
download_size: 1910975123
dataset_size: 3594070843
- config_name: smalltalk
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 819969243
num_examples: 573413
download_size: 336194442
dataset_size: 819969243
- config_name: smarty
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 285175411
num_examples: 108090
download_size: 116900898
dataset_size: 285175411
- config_name: solidity
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 858034131
num_examples: 146765
download_size: 313402558
dataset_size: 858034131
- config_name: sql
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 3204957943
num_examples: 807712
download_size: 1158941259
dataset_size: 3204957943
- config_name: swift
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 5737114063
num_examples: 1629427
download_size: 2312932533
dataset_size: 5737114063
- config_name: tex
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 1421867921
num_examples: 239992
download_size: 673257593
dataset_size: 1421867921
- config_name: toml
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 509628141
num_examples: 406890
download_size: 267759373
dataset_size: 509628141
- config_name: twig
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 593823418
num_examples: 252925
download_size: 250911166
dataset_size: 593823418
- config_name: typescript
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 23076425795
num_examples: 8211503
download_size: 9976159487
dataset_size: 23076425795
- config_name: visual-basic
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 619538396
num_examples: 97402
download_size: 185394542
dataset_size: 619538396
- config_name: vue
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 5558614502
num_examples: 1316802
download_size: 2277684369
dataset_size: 5558614502
- config_name: yaml
features:
- name: hexsha
dtype: string
- name: size
dtype: int64
- name: ext
dtype: string
- name: lang
dtype: string
- name: max_stars_repo_path
dtype: string
- name: max_stars_repo_name
dtype: string
- name: max_stars_repo_head_hexsha
dtype: string
- name: max_stars_repo_licenses
sequence: string
- name: max_stars_count
dtype: int64
- name: max_stars_repo_stars_event_min_datetime
dtype: string
- name: max_stars_repo_stars_event_max_datetime
dtype: string
- name: max_issues_repo_path
dtype: string
- name: max_issues_repo_name
dtype: string
- name: max_issues_repo_head_hexsha
dtype: string
- name: max_issues_repo_licenses
sequence: string
- name: max_issues_count
dtype: int64
- name: max_issues_repo_issues_event_min_datetime
dtype: string
- name: max_issues_repo_issues_event_max_datetime
dtype: string
- name: max_forks_repo_path
dtype: string
- name: max_forks_repo_name
dtype: string
- name: max_forks_repo_head_hexsha
dtype: string
- name: max_forks_repo_licenses
sequence: string
- name: max_forks_count
dtype: int64
- name: max_forks_repo_forks_event_min_datetime
dtype: string
- name: max_forks_repo_forks_event_max_datetime
dtype: string
- name: content
dtype: string
- name: avg_line_length
dtype: float64
- name: max_line_length
dtype: int64
- name: alphanum_fraction
dtype: float64
- name: retain
dtype: bool
- name: num_tokens
dtype: int64
splits:
- name: train
num_bytes: 13316181015
num_examples: 4733381
download_size: 5846325993
dataset_size: 13316181015
configs:
- config_name: arduino
data_files:
- split: train
path: arduino/train-*
- config_name: batchfile
data_files:
- split: train
path: batchfile/train-*
- config_name: c
data_files:
- split: train
path: c/train-*
- config_name: c-sharp
data_files:
- split: train
path: c-sharp/train-*
- config_name: clojure
data_files:
- split: train
path: clojure/train-*
- config_name: cmake
data_files:
- split: train
path: cmake/train-*
- config_name: coffeescript
data_files:
- split: train
path: coffeescript/train-*
- config_name: common-lisp
data_files:
- split: train
path: common-lisp/train-*
- config_name: cpp
data_files:
- split: train
path: cpp/train-*
- config_name: crystal
data_files:
- split: train
path: crystal/train-*
- config_name: css
data_files:
- split: train
path: css/train-*
- config_name: cuda
data_files:
- split: train
path: cuda/train-*
- config_name: dart
data_files:
- split: train
path: dart/train-*
- config_name: elixir
data_files:
- split: train
path: elixir/train-*
- config_name: emacs-lisp
data_files:
- split: train
path: emacs-lisp/train-*
- config_name: erlang
data_files:
- split: train
path: erlang/train-*
- config_name: f-sharp
data_files:
- split: train
path: f-sharp/train-*
- config_name: fortran
data_files:
- split: train
path: fortran/train-*
- config_name: glsl
data_files:
- split: train
path: glsl/train-*
- config_name: go
data_files:
- split: train
path: go/train-*
- config_name: groovy
data_files:
- split: train
path: groovy/train-*
- config_name: haml
data_files:
- split: train
path: haml/train-*
- config_name: haskell
data_files:
- split: train
path: haskell/train-*
- config_name: html
data_files:
- split: train
path: html/train-*
- config_name: html_django
data_files:
- split: train
path: html_django/train-*
- config_name: html_erb
data_files:
- split: train
path: html_erb/train-*
- config_name: html_php
data_files:
- split: train
path: html_php/train-*
- config_name: java
data_files:
- split: train
path: java/train-*
- config_name: javascript
data_files:
- split: train
path: javascript/train-*
- config_name: json
data_files:
- split: train
path: json/train-*
- config_name: julia
data_files:
- split: train
path: julia/train-*
- config_name: kotlin
data_files:
- split: train
path: kotlin/train-*
- config_name: llvm
data_files:
- split: train
path: llvm/train-*
- config_name: lua
data_files:
- split: train
path: lua/train-*
- config_name: markdown
data_files:
- split: train
path: markdown/train-*
- config_name: objective-cpp
data_files:
- split: train
path: objective-cpp/train-*
- config_name: ocaml
data_files:
- split: train
path: ocaml/train-*
- config_name: pascal
data_files:
- split: train
path: pascal/train-*
- config_name: perl
data_files:
- split: train
path: perl/train-*
- config_name: php
data_files:
- split: train
path: php/train-*
- config_name: powershell
data_files:
- split: train
path: powershell/train-*
- config_name: python
data_files:
- split: train
path: python/train-*
- config_name: ruby
data_files:
- split: train
path: ruby/train-*
- config_name: rust
data_files:
- split: train
path: rust/train-*
- config_name: sass
data_files:
- split: train
path: sass/train-*
- config_name: scala
data_files:
- split: train
path: scala/train-*
- config_name: scss
data_files:
- split: train
path: scss/train-*
- config_name: shell
data_files:
- split: train
path: shell/train-*
- config_name: smalltalk
data_files:
- split: train
path: smalltalk/train-*
- config_name: smarty
data_files:
- split: train
path: smarty/train-*
- config_name: solidity
data_files:
- split: train
path: solidity/train-*
- config_name: sql
data_files:
- split: train
path: sql/train-*
- config_name: swift
data_files:
- split: train
path: swift/train-*
- config_name: tex
data_files:
- split: train
path: tex/train-*
- config_name: toml
data_files:
- split: train
path: toml/train-*
- config_name: twig
data_files:
- split: train
path: twig/train-*
- config_name: typescript
data_files:
- split: train
path: typescript/train-*
- config_name: visual-basic
data_files:
- split: train
path: visual-basic/train-*
- config_name: vue
data_files:
- split: train
path: vue/train-*
- config_name: yaml
data_files:
- split: train
path: yaml/train-*
---
This is a filtered version of the near-deduped `bigcode/the-stack-dedup` dataset. We further apply the following filters:
1. For files forked more than 25 times, we retain them if the average line length is less than 140, the maximum line length is less than 500 and the alphanumeric fraction is more than 25%.
2. For files forked between 10 and 25 times, we retain them if the average line length is less than 120, the maximum line length is less than 200 and the alphanumeric fraction is more than 35%.
3. For files forked less than 10 times, we retain them if the average line length is less than 100, the maximum line length is less than 200 and the alphanumeric fraction is more than 40%.
4. We only retain language splits that still have more than 50,000 samples after the above filtering steps, with the exception of `llvm`, which we also retain. This leaves us with 60 languages.
5. We only retain samples from conventionally used extensions and drop samples from valid but uncommon extensions. We select the following extensions for the 60 languages:
ext_map = {
"ASP":
[
".asp",
],
"Arduino":
[
".ino"
],
"AsciiDoc":
[
".asciidoc",
],
"Batchfile":
[
".bat",
".cmd"
],
"C":
[
".c",
".h",
],
"C#":
[
".cs",
],
"C++":
[
".cpp",
".c++",
".cc",
".cxx",
".h++",
".hh",
".hpp",
".hxx",
],
"CMake":
[
".cmake",
],
"CSS":
[
".css"
],
"Clojure":
[
".clj",
],
"CoffeeScript":
[
".coffee",
],
"Common Lisp":
[
".lisp",
],
"Crystal":
[
".cr"
],
"Cuda":
[
".cu",
".cuh"
],
"Dart":
[
".dart"
],
"Dockerfile":
[
".dockerfile",
"Dockerfile"
],
"Elixir":
[
".ex",
],
"Emacs Lisp":
[
".el",
".emacs",
],
"Erlang":
[
".erl",
],
"F#":
[
".fs",
],
"FORTRAN":
[
".f90",
".f",
".f03",
".f08",
".f77",
".f95",
],
"GLSL":
[
".glsl",
],
"Go":
[
".go"
],
"Groovy":
[
".groovy",
],
"HCL":
[
".hcl",
],
"HTML":
[
".html",
],
"HTML+Django":
[
".mustache",
".jinja"
],
"HTML+ERB":
[
".erb",
],
"HTML+PHP":
[
".phtml"
],
"Haml":
[
".haml",
],
"Haskell":
[
".hs",
],
"JSON":
[
".json",
],
"Java":
[
".java"
],
"JavaScript":
[
".js",
],
"Julia":
[
".jl"
],
"Kotlin":
[
".kt",
],
"LLVM":
[
".ll"
],
"Lua":
[
".lua",
],
"Makefile":
[
"Makefile"
],
"Markdown":
[
".md",
".markdown",
],
"OCaml":
[
".ml",
],
"Objective-C++":
[
".mm"
],
"PHP":
[
".php",
".php3",
".php4",
".php5",
],
"Pascal":
[
".pas",
],
"Perl":
[
".pl",
],
"PowerShell":
[
".ps1",
".psm1"
],
"Python":
[
".py",
],
"R":
[
".r",
],
"Ruby":
[
".rb",
],
"Rust":
[
".rs",
],
"SCSS":
[
".scss"
],
"SQL":
[
".plsql",
".sql",
".ddl",
],
"Sass":
[
".sass"
],
"Scala":
[
".scala",
".sbt"
],
"Shell":
[
".sh",
".bash",
".zsh"
],
"Smalltalk":
[
".st"
],
"Smarty":
[
".tpl"
],
"Solidity":
[
".sol"
],
"Swift":
[
".swift"
],
"TOML":
[
".toml"
],
"TeX":
[
".tex",
".sty",
],
"Twig":
[
".twig"
],
"TypeScript":
[
".ts",
],
"Visual Basic":
[
".vb",
],
"Vue":
[
".vue"
],
"YAML":
[
".yml",
".yaml",
],
}
Below, we provide the language-wise token-count breakdown of the dataset as counted by the `meta-llama/Meta-Llama-3-8B` tokenizer (the full dataset is around 144 Billion tokens):
| **Language** | **Token\_Count** |
|----------------|------------------|
| Json | 18,349,014,478 |
| Java | 16,249,736,121 |
| C | 14,381,796,257 |
| Python | 12,868,526,213 |
| Javascript | 11,780,759,219 |
| Cpp | 10,912,556,820 |
| Php | 8,934,867,699 |
| C\-Sharp | 6,784,433,504 |
| Markdown | 5,614,520,807 |
| Go | 5,561,257,406 |
| Html | 4,956,588,993 |
| Typescript | 4,437,582,117 |
| Yaml | 3,464,120,199 |
| Css | 2,472,231,309 |
| Rust | 1,932,035,717 |
| Ruby | 1,517,724,917 |
| Vue | 1,188,684,557 |
| Kotlin | 1,136,507,152 |
| Swift | 1,051,682,820 |
| Scala | 979,598,470 |
| Scss | 954,556,739 |
| Sql | 910,548,728 |
| Shell | 735,010,998 |
| Dart | 697,568,091 |
| Lua | 650,818,157 |
| Haskell | 528,482,496 |
| Fortran | 524,261,306 |
| Tex | 429,010,699 |
| Julia | 322,241,716 |
| Pascal | 245,237,366 |
| Ocaml | 223,652,453 |
| Perl | 215,973,238 |
| Solidity | 212,647,529 |
| Common\-Lisp | 177,139,498 |
| Powershell | 172,556,068 |
| Erlang | 171,211,323 |
| Arduino | 166,862,916 |
| Groovy | 166,678,846 |
| Objective\-Cpp | 158,210,523 |
| Html\_erb | 145,488,020 |
| Llvm | 135,137,323 |
| Cuda | 131,054,442 |
| Coffeescript | 129,999,078 |
| Smalltalk | 124,718,870 |
| Visual\-Basic | 121,466,641 |
| Elixir | 119,466,430 |
| Twig | 106,160,625 |
| F\-Sharp | 104,922,127 |
| Toml | 101,217,092 |
| Emacs\-Lisp | 94,724,490 |
| Cmake | 89,707,698 |
| Clojure | 79,005,141 |
| Batchfile | 62,657,884 |
| Smarty | 61,937,213 |
| Crystal | 47,220,882 |
| Sass | 43,661,923 |
| Glsl | 32,127,331 |
| Html\_php | 29,231,389 |
| Haml | 25,043,854 |
| Html\_django | 16,596,017 |
| **Total** | 144,038,437,935 |
提供机构:
iNeil77



