asahi417/seamless-align-enA-hiA.tokenized.encodec
收藏Hugging Face2024-05-31 更新2024-06-12 收录
下载链接:
https://hf-mirror.com/datasets/asahi417/seamless-align-enA-hiA.tokenized.encodec
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: subset_1
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 859474522
num_examples: 2295
download_size: 131903597
dataset_size: 859474522
- config_name: subset_10
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 637582321
num_examples: 2026
download_size: 98842555
dataset_size: 637582321
- config_name: subset_11
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 614079087
num_examples: 1984
download_size: 95072280
dataset_size: 614079087
- config_name: subset_12
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 620366288
num_examples: 2004
download_size: 96086284
dataset_size: 620366288
- config_name: subset_13
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 586303575
num_examples: 1931
download_size: 90799434
dataset_size: 586303575
- config_name: subset_14
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 613901849
num_examples: 1980
download_size: 95121567
dataset_size: 613901849
- config_name: subset_15
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 574423128
num_examples: 1959
download_size: 89117360
dataset_size: 574423128
- config_name: subset_16
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 596620190
num_examples: 2001
download_size: 92601152
dataset_size: 596620190
- config_name: subset_17
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 628994958
num_examples: 2022
download_size: 97646543
dataset_size: 628994958
- config_name: subset_18
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 595646848
num_examples: 1988
download_size: 92399494
dataset_size: 595646848
- config_name: subset_19
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 598336801
num_examples: 1965
download_size: 92871627
dataset_size: 598336801
- config_name: subset_2
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 857803928
num_examples: 2335
download_size: 131765202
dataset_size: 857803928
- config_name: subset_20
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 605729037
num_examples: 1971
download_size: 93811180
dataset_size: 605729037
- config_name: subset_21
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 588502178
num_examples: 1976
download_size: 91360603
dataset_size: 588502178
- config_name: subset_22
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 617611513
num_examples: 2018
download_size: 95808771
dataset_size: 617611513
- config_name: subset_23
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 591465283
num_examples: 1981
download_size: 91839994
dataset_size: 591465283
- config_name: subset_24
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 591406803
num_examples: 1970
download_size: 91935787
dataset_size: 591406803
- config_name: subset_25
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 588220650
num_examples: 1971
download_size: 91312572
dataset_size: 588220650
- config_name: subset_26
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 582406673
num_examples: 1933
download_size: 90433280
dataset_size: 582406673
- config_name: subset_27
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 585242164
num_examples: 1956
download_size: 90776094
dataset_size: 585242164
- config_name: subset_28
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 601211892
num_examples: 1958
download_size: 93294895
dataset_size: 601211892
- config_name: subset_29
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 598070460
num_examples: 1948
download_size: 92906700
dataset_size: 598070460
- config_name: subset_3
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 800322191
num_examples: 2282
download_size: 123119922
dataset_size: 800322191
- config_name: subset_30
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 581940007
num_examples: 1934
download_size: 90318683
dataset_size: 581940007
- config_name: subset_31
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 578517056
num_examples: 1938
download_size: 89679857
dataset_size: 578517056
- config_name: subset_32
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 596220667
num_examples: 1964
download_size: 92598685
dataset_size: 596220667
- config_name: subset_33
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 581990969
num_examples: 1956
download_size: 90471237
dataset_size: 581990969
- config_name: subset_34
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 566628571
num_examples: 1870
download_size: 87920092
dataset_size: 566628571
- config_name: subset_35
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 579084763
num_examples: 1892
download_size: 89934283
dataset_size: 579084763
- config_name: subset_36
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 598188341
num_examples: 1912
download_size: 92842406
dataset_size: 598188341
- config_name: subset_37
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 586972483
num_examples: 1922
download_size: 91070916
dataset_size: 586972483
- config_name: subset_38
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 573889300
num_examples: 1887
download_size: 89156393
dataset_size: 573889300
- config_name: subset_39
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 579346429
num_examples: 1896
download_size: 89893742
dataset_size: 579346429
- config_name: subset_4
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 759132546
num_examples: 2220
download_size: 116971977
dataset_size: 759132546
- config_name: subset_40
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 582550930
num_examples: 1882
download_size: 90441799
dataset_size: 582550930
- config_name: subset_41
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 594695580
num_examples: 1923
download_size: 92369087
dataset_size: 594695580
- config_name: subset_42
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 593854525
num_examples: 1915
download_size: 92230057
dataset_size: 593854525
- config_name: subset_43
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 593056925
num_examples: 1917
download_size: 92077272
dataset_size: 593056925
- config_name: subset_44
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 582315208
num_examples: 1889
download_size: 90476818
dataset_size: 582315208
- config_name: subset_45
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 581930684
num_examples: 1895
download_size: 90460281
dataset_size: 581930684
- config_name: subset_46
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 592109035
num_examples: 1889
download_size: 92016497
dataset_size: 592109035
- config_name: subset_47
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 587549620
num_examples: 1911
download_size: 91281983
dataset_size: 587549620
- config_name: subset_48
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 582072164
num_examples: 1894
download_size: 90473679
dataset_size: 582072164
- config_name: subset_49
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 608218761
num_examples: 1929
download_size: 94486185
dataset_size: 608218761
- config_name: subset_5
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 726065293
num_examples: 2202
download_size: 111837002
dataset_size: 726065293
- config_name: subset_50
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 593606054
num_examples: 1889
download_size: 92238795
dataset_size: 593606054
- config_name: subset_51
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 578238786
num_examples: 1863
download_size: 89866649
dataset_size: 578238786
- config_name: subset_52
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 602457748
num_examples: 1945
download_size: 93620259
dataset_size: 602457748
- config_name: subset_53
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 585448884
num_examples: 1911
download_size: 90939329
dataset_size: 585448884
- config_name: subset_54
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 616552921
num_examples: 1964
download_size: 95778407
dataset_size: 616552921
- config_name: subset_55
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 599501542
num_examples: 1899
download_size: 93041753
dataset_size: 599501542
- config_name: subset_56
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 595489208
num_examples: 1936
download_size: 92526381
dataset_size: 595489208
- config_name: subset_57
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 605586092
num_examples: 1920
download_size: 94117180
dataset_size: 605586092
- config_name: subset_58
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 606761543
num_examples: 1938
download_size: 94241727
dataset_size: 606761543
- config_name: subset_59
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 605661348
num_examples: 1907
download_size: 94101772
dataset_size: 605661348
- config_name: subset_6
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 672870960
num_examples: 2123
download_size: 103926846
dataset_size: 672870960
- config_name: subset_60
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 585974036
num_examples: 1874
download_size: 91164500
dataset_size: 585974036
- config_name: subset_61
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 601324649
num_examples: 1904
download_size: 93488312
dataset_size: 601324649
- config_name: subset_62
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 610232104
num_examples: 1887
download_size: 94858684
dataset_size: 610232104
- config_name: subset_63
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 594089650
num_examples: 1861
download_size: 92338161
dataset_size: 594089650
- config_name: subset_64
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 598507012
num_examples: 1893
download_size: 92926289
dataset_size: 598507012
- config_name: subset_65
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 581399725
num_examples: 1886
download_size: 90344432
dataset_size: 581399725
- config_name: subset_66
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 607177758
num_examples: 1918
download_size: 94313434
dataset_size: 607177758
- config_name: subset_67
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 617193940
num_examples: 1908
download_size: 95751208
dataset_size: 617193940
- config_name: subset_68
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 612310193
num_examples: 1939
download_size: 95085020
dataset_size: 612310193
- config_name: subset_69
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 624189811
num_examples: 1932
download_size: 97000822
dataset_size: 624189811
- config_name: subset_7
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 668366043
num_examples: 2085
download_size: 103327997
dataset_size: 668366043
- config_name: subset_70
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 605614079
num_examples: 1887
download_size: 94065558
dataset_size: 605614079
- config_name: subset_71
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 622290798
num_examples: 1932
download_size: 96691206
dataset_size: 622290798
- config_name: subset_72
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 621692027
num_examples: 1916
download_size: 96671064
dataset_size: 621692027
- config_name: subset_73
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 619917092
num_examples: 1900
download_size: 96297883
dataset_size: 619917092
- config_name: subset_74
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 600073193
num_examples: 1899
download_size: 93335587
dataset_size: 600073193
- config_name: subset_75
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 619899864
num_examples: 1927
download_size: 96380111
dataset_size: 619899864
- config_name: subset_76
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 617671464
num_examples: 1912
download_size: 95983755
dataset_size: 617671464
- config_name: subset_77
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 623196459
num_examples: 1922
download_size: 96840283
dataset_size: 623196459
- config_name: subset_78
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 604620060
num_examples: 1909
download_size: 93919810
dataset_size: 604620060
- config_name: subset_79
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 624889748
num_examples: 1948
download_size: 97108809
dataset_size: 624889748
- config_name: subset_8
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 640873033
num_examples: 2083
download_size: 99239901
dataset_size: 640873033
- config_name: subset_80
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 599687883
num_examples: 1877
download_size: 93117591
dataset_size: 599687883
- config_name: subset_81
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 616372137
num_examples: 1898
download_size: 95815971
dataset_size: 616372137
- config_name: subset_82
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 614709241
num_examples: 1896
download_size: 95558905
dataset_size: 614709241
- config_name: subset_83
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 604659746
num_examples: 1878
download_size: 93971802
dataset_size: 604659746
- config_name: subset_84
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 612349655
num_examples: 1902
download_size: 95089038
dataset_size: 612349655
- config_name: subset_85
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 635783420
num_examples: 1984
download_size: 98830150
dataset_size: 635783420
- config_name: subset_86
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 639714989
num_examples: 1930
download_size: 99427278
dataset_size: 639714989
- config_name: subset_87
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 596887316
num_examples: 1849
download_size: 92696815
dataset_size: 596887316
- config_name: subset_88
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: hiA.audio.tokens
sequence:
sequence: int64
- name: enA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 590147832
num_examples: 1849
download_size: 91715309
dataset_size: 590147832
- config_name: subset_89
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 619649388
num_examples: 1894
download_size: 96287177
dataset_size: 619649388
- config_name: subset_9
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 637310133
num_examples: 2022
download_size: 98772067
dataset_size: 637310133
- config_name: subset_90
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 600331500
num_examples: 1893
download_size: 93319826
dataset_size: 600331500
- config_name: subset_91
features:
- name: line_no
dtype: int64
- name: enA.id
dtype: string
- name: enA.laser_score
dtype: float64
- name: hiA.id
dtype: string
- name: hiA.laser_score
dtype: float64
- name: enA.audio.tokens
sequence:
sequence: int64
- name: hiA.audio.tokens
sequence:
sequence: int64
splits:
- name: train
num_bytes: 591726520
num_examples: 1820
download_size: 91957068
dataset_size: 591726520
configs:
- config_name: subset_1
data_files:
- split: train
path: subset_1/train-*
- config_name: subset_10
data_files:
- split: train
path: subset_10/train-*
- config_name: subset_11
data_files:
- split: train
path: subset_11/train-*
- config_name: subset_12
data_files:
- split: train
path: subset_12/train-*
- config_name: subset_13
data_files:
- split: train
path: subset_13/train-*
- config_name: subset_14
data_files:
- split: train
path: subset_14/train-*
- config_name: subset_15
data_files:
- split: train
path: subset_15/train-*
- config_name: subset_16
data_files:
- split: train
path: subset_16/train-*
- config_name: subset_17
data_files:
- split: train
path: subset_17/train-*
- config_name: subset_18
data_files:
- split: train
path: subset_18/train-*
- config_name: subset_19
data_files:
- split: train
path: subset_19/train-*
- config_name: subset_2
data_files:
- split: train
path: subset_2/train-*
- config_name: subset_20
data_files:
- split: train
path: subset_20/train-*
- config_name: subset_21
data_files:
- split: train
path: subset_21/train-*
- config_name: subset_22
data_files:
- split: train
path: subset_22/train-*
- config_name: subset_23
data_files:
- split: train
path: subset_23/train-*
- config_name: subset_24
data_files:
- split: train
path: subset_24/train-*
- config_name: subset_25
data_files:
- split: train
path: subset_25/train-*
- config_name: subset_26
data_files:
- split: train
path: subset_26/train-*
- config_name: subset_27
data_files:
- split: train
path: subset_27/train-*
- config_name: subset_28
data_files:
- split: train
path: subset_28/train-*
- config_name: subset_29
data_files:
- split: train
path: subset_29/train-*
- config_name: subset_3
data_files:
- split: train
path: subset_3/train-*
- config_name: subset_30
data_files:
- split: train
path: subset_30/train-*
- config_name: subset_31
data_files:
- split: train
path: subset_31/train-*
- config_name: subset_32
data_files:
- split: train
path: subset_32/train-*
- config_name: subset_33
data_files:
- split: train
path: subset_33/train-*
- config_name: subset_34
data_files:
- split: train
path: subset_34/train-*
- config_name: subset_35
data_files:
- split: train
path: subset_35/train-*
- config_name: subset_36
data_files:
- split: train
path: subset_36/train-*
- config_name: subset_37
data_files:
- split: train
path: subset_37/train-*
- config_name: subset_38
data_files:
- split: train
path: subset_38/train-*
- config_name: subset_39
data_files:
- split: train
path: subset_39/train-*
- config_name: subset_4
data_files:
- split: train
path: subset_4/train-*
- config_name: subset_40
data_files:
- split: train
path: subset_40/train-*
- config_name: subset_41
data_files:
- split: train
path: subset_41/train-*
- config_name: subset_42
data_files:
- split: train
path: subset_42/train-*
- config_name: subset_43
data_files:
- split: train
path: subset_43/train-*
- config_name: subset_44
data_files:
- split: train
path: subset_44/train-*
- config_name: subset_45
data_files:
- split: train
path: subset_45/train-*
- config_name: subset_46
data_files:
- split: train
path: subset_46/train-*
- config_name: subset_47
data_files:
- split: train
path: subset_47/train-*
- config_name: subset_48
data_files:
- split: train
path: subset_48/train-*
- config_name: subset_49
data_files:
- split: train
path: subset_49/train-*
- config_name: subset_5
data_files:
- split: train
path: subset_5/train-*
- config_name: subset_50
data_files:
- split: train
path: subset_50/train-*
- config_name: subset_51
data_files:
- split: train
path: subset_51/train-*
- config_name: subset_52
data_files:
- split: train
path: subset_52/train-*
- config_name: subset_53
data_files:
- split: train
path: subset_53/train-*
- config_name: subset_54
data_files:
- split: train
path: subset_54/train-*
- config_name: subset_55
data_files:
- split: train
path: subset_55/train-*
- config_name: subset_56
data_files:
- split: train
path: subset_56/train-*
- config_name: subset_57
data_files:
- split: train
path: subset_57/train-*
- config_name: subset_58
data_files:
- split: train
path: subset_58/train-*
- config_name: subset_59
data_files:
- split: train
path: subset_59/train-*
- config_name: subset_6
data_files:
- split: train
path: subset_6/train-*
- config_name: subset_60
data_files:
- split: train
path: subset_60/train-*
- config_name: subset_61
data_files:
- split: train
path: subset_61/train-*
- config_name: subset_62
data_files:
- split: train
path: subset_62/train-*
- config_name: subset_63
data_files:
- split: train
path: subset_63/train-*
- config_name: subset_64
data_files:
- split: train
path: subset_64/train-*
- config_name: subset_65
data_files:
- split: train
path: subset_65/train-*
- config_name: subset_66
data_files:
- split: train
path: subset_66/train-*
- config_name: subset_67
data_files:
- split: train
path: subset_67/train-*
- config_name: subset_68
data_files:
- split: train
path: subset_68/train-*
- config_name: subset_69
data_files:
- split: train
path: subset_69/train-*
- config_name: subset_7
data_files:
- split: train
path: subset_7/train-*
- config_name: subset_70
data_files:
- split: train
path: subset_70/train-*
- config_name: subset_71
data_files:
- split: train
path: subset_71/train-*
- config_name: subset_72
data_files:
- split: train
path: subset_72/train-*
- config_name: subset_73
data_files:
- split: train
path: subset_73/train-*
- config_name: subset_74
data_files:
- split: train
path: subset_74/train-*
- config_name: subset_75
data_files:
- split: train
path: subset_75/train-*
- config_name: subset_76
data_files:
- split: train
path: subset_76/train-*
- config_name: subset_77
data_files:
- split: train
path: subset_77/train-*
- config_name: subset_78
data_files:
- split: train
path: subset_78/train-*
- config_name: subset_79
data_files:
- split: train
path: subset_79/train-*
- config_name: subset_8
data_files:
- split: train
path: subset_8/train-*
- config_name: subset_80
data_files:
- split: train
path: subset_80/train-*
- config_name: subset_81
data_files:
- split: train
path: subset_81/train-*
- config_name: subset_82
data_files:
- split: train
path: subset_82/train-*
- config_name: subset_83
data_files:
- split: train
path: subset_83/train-*
- config_name: subset_84
data_files:
- split: train
path: subset_84/train-*
- config_name: subset_85
data_files:
- split: train
path: subset_85/train-*
- config_name: subset_86
data_files:
- split: train
path: subset_86/train-*
- config_name: subset_87
data_files:
- split: train
path: subset_87/train-*
- config_name: subset_88
data_files:
- split: train
path: subset_88/train-*
- config_name: subset_89
data_files:
- split: train
path: subset_89/train-*
- config_name: subset_9
data_files:
- split: train
path: subset_9/train-*
- config_name: subset_90
data_files:
- split: train
path: subset_90/train-*
- config_name: subset_91
data_files:
- split: train
path: subset_91/train-*
---
本数据集详情如下:
### 数据集整体结构
本数据集包含91个配置子集,编号为"subset_1"至"subset_91"。所有配置子集均包含相同的7个特征字段(仅`enA.audio.tokens`与`hiA.audio.tokens`两个字段的先后顺序在部分子集中有所不同),各子集训练拆分的字节数、样本数量、下载大小及数据集总大小存在差异。
### 单配置子集详情
每个配置子集包含以下特征字段:
1. `line_no`:行号,数据类型为64位整数(int64)
2. `enA.id`:英语语料标识符,数据类型为字符串(string)
3. `enA.laser_score`:英语语料的LASER相似度得分(LASER score;LASER为多语言句向量工具的名称,并非"激光"),数据类型为64位浮点数(float64)
4. `hiA.id`:印地语语料标识符,数据类型为字符串(string)
5. `hiA.laser_score`:印地语语料的LASER相似度得分(LASER score;LASER为多语言句向量工具的名称,并非"激光"),数据类型为64位浮点数(float64)
6. `enA.audio.tokens`:英语音频Token序列,为嵌套的二维64位整数序列
7. `hiA.audio.tokens`:印地语音频Token序列,为嵌套的二维64位整数序列
所有配置子集仅包含训练(train)拆分,各子集的训练拆分参数如下(以部分子集为例):
- "subset_1":训练拆分字节数859474522,样本数2295,下载大小131903597,数据集总大小859474522
- "subset_10":训练拆分字节数637582321,样本数2026,下载大小98842555,数据集总大小637582321
其余子集的具体参数请参照原始配置文件。
### 数据文件映射
所有配置子集均对应训练拆分的数据文件,文件路径格式为`subset_{编号}/train-*`。
提供机构:
asahi417
原始信息汇总
数据集概述
数据集配置信息
| 配置名称 | 特征数量 | 主要特征 | 数据类型 |
|---|---|---|---|
| subset_1 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_10 | 7 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_11 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_12 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_13 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_14 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_15 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_16 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_17 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_18 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_19 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_2 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_20 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_21 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_22 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_23 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_24 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_25 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_26 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_27 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_28 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_29 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_3 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_30 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_31 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_32 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_33 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_34 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_35 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_36 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_37 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_38 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_39 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_4 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_40 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_41 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_42 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_43 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_44 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_45 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_46 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
| subset_47 | 8 | line_no, enA.id, enA.laser_score, hiA.id, hiA.laser_score, enA.audio.tokens, hiA.audio.tokens | int64, string, float64, int64 |
数据集大小信息
| 配置名称 | 训练集大小 | 训练集示例数 | 下载大小 |
|---|---|---|---|
| subset_1 | 859474522 bytes | 2295 | 131903597 bytes |
| subset_10 | 637582321 bytes | 2026 | 98842555 bytes |
| subset_11 | 614079087 bytes | 1984 | 95072280 bytes |
| subset_12 | 620366288 bytes | 2004 | 96086284 bytes |
| subset_13 | 586303575 bytes | 1931 | 90799434 bytes |
| subset_14 | 613901849 bytes | 1980 | 95121567 bytes |
| subset_15 | 574423128 bytes | 1959 | 89117360 bytes |
| subset_16 | 596620190 bytes | 2001 | 92601152 bytes |
| subset_17 | 628994958 bytes | 2022 | 97646543 bytes |
| subset_18 | 595646848 bytes | 1988 | 92399494 bytes |
| subset_19 | 598336801 bytes | 1965 | 92871627 bytes |
| subset_2 | 857803928 bytes | 2335 | 131765202 bytes |
| subset_20 | 605729037 bytes | 1971 | 93811180 bytes |
| subset_21 | 588502178 bytes | 1976 | 91360603 bytes |
| subset_22 | 617611513 bytes | 2018 | 95808771 bytes |
| subset_23 | 591465283 bytes | 1981 | 91839994 bytes |
| subset_24 | 591406803 bytes | 1970 | 91935787 bytes |
| subset_25 | 588220650 bytes | 1971 | 91312572 bytes |
| subset_26 | 582406673 bytes | 1933 | 90433280 bytes |
| subset_27 | 585242164 bytes | 1956 | 90776094 bytes |
| subset_28 | 601211892 bytes | 1958 | 93294895 bytes |
| subset_29 | 598070460 bytes | 1948 | 92906700 bytes |
| subset_3 | 800322191 bytes | 2282 | 123119922 bytes |
| subset_30 | 581940007 bytes | 1934 | 90318683 bytes |
| subset_31 | 578517056 bytes | 1938 | 89679857 bytes |
| subset_32 | 596220667 bytes | 1964 | 92598685 bytes |
| subset_33 | 581990969 bytes | 1956 | 90471237 bytes |
| subset_34 | 566628571 bytes | 1870 | 87920092 bytes |
| subset_35 | 579084763 bytes | 1892 | 89934283 bytes |
| subset_36 | 598188341 bytes | 1912 | 92842406 bytes |
| subset_37 | 586972483 bytes | 1922 | 91070916 bytes |
| subset_38 | 573889300 bytes | 1887 | 89156393 bytes |
| subset_39 |



