ptrdvn/YALD_v0_raw
收藏Hugging Face2026-03-24 更新2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/ptrdvn/YALD_v0_raw
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: all_audio_ban
features:
- name: id
dtype: string
- name: audio
dtype: audio
splits:
- name: train
num_bytes: 27976323765
num_examples: 13513
download_size: 27871605210
dataset_size: 27976323765
- config_name: all_audio_cym
features:
- name: id
dtype: string
- name: audio
dtype: audio
splits:
- name: train
num_bytes: 2136430489
num_examples: 1406
download_size: 2107168049
dataset_size: 2136430489
- config_name: all_audio_gle
features:
- name: id
dtype: string
- name: audio
dtype: audio
splits:
- name: train
num_bytes: 3265050571
num_examples: 1666
download_size: 3358698808
dataset_size: 3265050571
- config_name: all_audio_uig
features:
- name: id
dtype: string
- name: audio
dtype: audio
splits:
- name: train
num_bytes: 10999210562
num_examples: 3056
download_size: 11010287221
dataset_size: 10999210562
- config_name: all_audio_xho
features:
- name: id
dtype: string
- name: audio
dtype: audio
splits:
- name: train
num_bytes: 4380535689
num_examples: 2039
download_size: 4303320322
dataset_size: 4380535689
- config_name: all_segments_ban
features:
- name: id
dtype: large_string
- name: index
dtype: int64
- name: type
dtype: large_string
- name: ie_key
dtype: large_string
- name: url
dtype: large_string
- name: title
dtype: large_string
- name: description
dtype: large_string
- name: duration
dtype: float64
- name: channel_id
dtype: large_string
- name: channel
dtype: large_string
- name: channel_url
dtype: large_string
- name: uploader
dtype: large_string
- name: uploader_id
dtype: large_string
- name: uploader_url
dtype: large_string
- name: thumbnails
list:
- name: height
dtype: int64
- name: url
dtype: string
- name: width
dtype: int64
- name: timestamp
dtype: float64
- name: release_timestamp
dtype: float64
- name: availability
dtype: float64
- name: view_count
dtype: float64
- name: live_status
dtype: large_string
- name: channel_is_verified
dtype: bool
- name: x_forwarded_for_ip
dtype: 'null'
- name: search_keyword
dtype: large_string
- name: is_cc
dtype: bool
- name: concurrent_view_count
dtype: float64
- name: channel_follower_count
dtype: float64
- name: playlist_count
dtype: float64
- name: title_desc_langs
list: string
- name: timestamps
list:
- name: end
dtype: float64
- name: start
dtype: float64
- name: merged_timestamps
list:
- name: end
dtype: float64
- name: start
dtype: float64
- name: split_timestamps
list:
- name: clip_id
dtype: int64
- name: end
dtype: float64
- name: start
dtype: float64
- name: transcript_omniASR_CTC_1B_v2
dtype: string
- name: transcript_omniASR_CTC_300M_v2
dtype: string
- name: transcript_omniASR_CTC_3B_v2
dtype: string
- name: total_speech_time
dtype: float64
- name: perc_speech_time
dtype: float64
splits:
- name: train
num_bytes: 341426050
num_examples: 239300
download_size: 229588857
dataset_size: 341426050
- config_name: all_segments_cym
features:
- name: id
dtype: large_string
- name: index
dtype: int64
- name: type
dtype: large_string
- name: ie_key
dtype: large_string
- name: url
dtype: large_string
- name: title
dtype: large_string
- name: description
dtype: large_string
- name: duration
dtype: float64
- name: channel_id
dtype: large_string
- name: channel
dtype: large_string
- name: channel_url
dtype: large_string
- name: uploader
dtype: large_string
- name: uploader_id
dtype: large_string
- name: uploader_url
dtype: large_string
- name: thumbnails
list:
- name: height
dtype: int64
- name: url
dtype: string
- name: width
dtype: int64
- name: timestamp
dtype: float64
- name: release_timestamp
dtype: float64
- name: availability
dtype: float64
- name: view_count
dtype: float64
- name: live_status
dtype: large_string
- name: channel_is_verified
dtype: bool
- name: x_forwarded_for_ip
dtype: 'null'
- name: search_keyword
dtype: large_string
- name: is_cc
dtype: bool
- name: channel_follower_count
dtype: float64
- name: playlist_count
dtype: float64
- name: concurrent_view_count
dtype: float64
- name: title_desc_langs
list: string
- name: timestamps
list:
- name: end
dtype: float64
- name: start
dtype: float64
- name: merged_timestamps
list:
- name: end
dtype: float64
- name: start
dtype: float64
- name: split_timestamps
list:
- name: clip_id
dtype: int64
- name: end
dtype: float64
- name: start
dtype: float64
- name: transcript_omniASR_CTC_1B_v2
dtype: string
- name: transcript_omniASR_CTC_300M_v2
dtype: string
- name: transcript_omniASR_CTC_3B_v2
dtype: string
- name: transcript_omniASR_CTC_7B_v2
dtype: string
- name: total_speech_time
dtype: float64
- name: perc_speech_time
dtype: float64
splits:
- name: train
num_bytes: 52491482
num_examples: 33576
download_size: 36623217
dataset_size: 52491482
- config_name: all_segments_gle
features:
- name: id
dtype: large_string
- name: index
dtype: int64
- name: type
dtype: large_string
- name: ie_key
dtype: large_string
- name: url
dtype: large_string
- name: title
dtype: large_string
- name: description
dtype: large_string
- name: duration
dtype: float64
- name: channel_id
dtype: large_string
- name: channel
dtype: large_string
- name: channel_url
dtype: large_string
- name: uploader
dtype: large_string
- name: uploader_id
dtype: large_string
- name: uploader_url
dtype: large_string
- name: thumbnails
list:
- name: height
dtype: int64
- name: url
dtype: string
- name: width
dtype: int64
- name: timestamp
dtype: float64
- name: release_timestamp
dtype: float64
- name: availability
dtype: float64
- name: view_count
dtype: float64
- name: live_status
dtype: large_string
- name: channel_is_verified
dtype: bool
- name: x_forwarded_for_ip
dtype: 'null'
- name: search_keyword
dtype: large_string
- name: is_cc
dtype: bool
- name: channel_follower_count
dtype: float64
- name: playlist_count
dtype: float64
- name: concurrent_view_count
dtype: float64
- name: title_desc_langs
list: string
- name: timestamps
list:
- name: end
dtype: float64
- name: start
dtype: float64
- name: merged_timestamps
list:
- name: end
dtype: float64
- name: start
dtype: float64
- name: split_timestamps
list:
- name: clip_id
dtype: int64
- name: end
dtype: float64
- name: start
dtype: float64
- name: transcript_omniASR_CTC_1B_v2
dtype: string
- name: transcript_omniASR_CTC_300M_v2
dtype: string
- name: transcript_omniASR_CTC_3B_v2
dtype: string
- name: transcript_omniASR_CTC_7B_v2
dtype: string
- name: total_speech_time
dtype: float64
- name: perc_speech_time
dtype: float64
splits:
- name: train
num_bytes: 117556177
num_examples: 79289
download_size: 79276714
dataset_size: 117556177
- config_name: all_segments_uig
features:
- name: id
dtype: large_string
- name: index
dtype: int64
- name: type
dtype: large_string
- name: ie_key
dtype: large_string
- name: url
dtype: large_string
- name: title
dtype: large_string
- name: description
dtype: large_string
- name: duration
dtype: float64
- name: channel_id
dtype: large_string
- name: channel
dtype: large_string
- name: channel_url
dtype: large_string
- name: uploader
dtype: large_string
- name: uploader_id
dtype: large_string
- name: uploader_url
dtype: large_string
- name: thumbnails
list:
- name: height
dtype: int64
- name: url
dtype: string
- name: width
dtype: int64
- name: timestamp
dtype: 'null'
- name: release_timestamp
dtype: float64
- name: availability
dtype: 'null'
- name: view_count
dtype: float64
- name: live_status
dtype: large_string
- name: channel_is_verified
dtype: bool
- name: x_forwarded_for_ip
dtype: 'null'
- name: search_keyword
dtype: large_string
- name: is_cc
dtype: bool
- name: concurrent_view_count
dtype: float64
- name: title_desc_langs
list: string
- name: timestamps
list:
- name: end
dtype: float64
- name: start
dtype: float64
- name: merged_timestamps
list:
- name: end
dtype: float64
- name: start
dtype: float64
- name: split_timestamps
list:
- name: clip_id
dtype: int64
- name: end
dtype: float64
- name: start
dtype: float64
- name: transcript_omniASR_CTC_1B_v2
dtype: string
- name: transcript_omniASR_CTC_300M_v2
dtype: string
- name: transcript_omniASR_CTC_3B_v2
dtype: string
- name: transcript_omniASR_CTC_7B_v2
dtype: string
- name: total_speech_time
dtype: float64
- name: perc_speech_time
dtype: float64
splits:
- name: train
num_bytes: 293639069
num_examples: 11546
download_size: 288317364
dataset_size: 293639069
- config_name: all_segments_xho
features:
- name: id
dtype: large_string
- name: index
dtype: int64
- name: type
dtype: large_string
- name: ie_key
dtype: large_string
- name: url
dtype: large_string
- name: title
dtype: large_string
- name: description
dtype: large_string
- name: duration
dtype: float64
- name: channel_id
dtype: large_string
- name: channel
dtype: large_string
- name: channel_url
dtype: large_string
- name: uploader
dtype: large_string
- name: uploader_id
dtype: large_string
- name: uploader_url
dtype: large_string
- name: thumbnails
list:
- name: height
dtype: int64
- name: url
dtype: string
- name: width
dtype: int64
- name: timestamp
dtype: float64
- name: release_timestamp
dtype: float64
- name: availability
dtype: float64
- name: view_count
dtype: float64
- name: live_status
dtype: large_string
- name: channel_is_verified
dtype: bool
- name: x_forwarded_for_ip
dtype: 'null'
- name: search_keyword
dtype: large_string
- name: is_cc
dtype: bool
- name: channel_follower_count
dtype: float64
- name: playlist_count
dtype: float64
- name: title_desc_langs
list: string
- name: concurrent_view_count
dtype: float64
- name: timestamps
list:
- name: end
dtype: float64
- name: start
dtype: float64
- name: merged_timestamps
list:
- name: end
dtype: float64
- name: start
dtype: float64
- name: split_timestamps
list:
- name: clip_id
dtype: int64
- name: end
dtype: float64
- name: start
dtype: float64
- name: transcript_omniASR_CTC_1B_v2
dtype: string
- name: transcript_omniASR_CTC_300M_v2
dtype: string
- name: transcript_omniASR_CTC_3B_v2
dtype: string
- name: transcript_omniASR_CTC_7B_v2
dtype: string
- name: total_speech_time
dtype: float64
- name: perc_speech_time
dtype: float64
splits:
- name: train
num_bytes: 96178037
num_examples: 42661
download_size: 75216325
dataset_size: 96178037
- config_name: keywords_ban
features:
- name: keyword
dtype: large_string
- name: lang_name
dtype: large_string
splits:
- name: train
num_bytes: 26872
num_examples: 541
download_size: 10220
dataset_size: 26872
- config_name: keywords_cym
features:
- name: keyword
dtype: large_string
- name: lang_name
dtype: large_string
splits:
- name: train
num_bytes: 22601
num_examples: 529
download_size: 9674
dataset_size: 22601
- config_name: keywords_gle
features:
- name: keyword
dtype: large_string
- name: lang_name
dtype: large_string
splits:
- name: train
num_bytes: 26244
num_examples: 529
download_size: 11507
dataset_size: 26244
- config_name: keywords_uig
features:
- name: keyword
dtype: large_string
- name: lang_name
dtype: large_string
splits:
- name: train
num_bytes: 32291
num_examples: 539
download_size: 12279
dataset_size: 32291
- config_name: keywords_xho
features:
- name: keyword
dtype: large_string
- name: lang_name
dtype: large_string
splits:
- name: train
num_bytes: 24148
num_examples: 530
download_size: 9648
dataset_size: 24148
- config_name: search_results_ban
features:
- name: type
dtype: large_string
- name: ie_key
dtype: large_string
- name: id
dtype: large_string
- name: url
dtype: large_string
- name: title
dtype: large_string
- name: description
dtype: large_string
- name: duration
dtype: float64
- name: channel_id
dtype: large_string
- name: channel
dtype: large_string
- name: channel_url
dtype: large_string
- name: uploader
dtype: large_string
- name: uploader_id
dtype: large_string
- name: uploader_url
dtype: large_string
- name: thumbnails
list:
- name: height
dtype: int64
- name: url
dtype: string
- name: width
dtype: int64
- name: timestamp
dtype: float64
- name: release_timestamp
dtype: float64
- name: availability
dtype: float64
- name: view_count
dtype: float64
- name: live_status
dtype: large_string
- name: channel_is_verified
dtype: bool
- name: x_forwarded_for_ip
dtype: 'null'
- name: search_keyword
dtype: large_string
- name: is_cc
dtype: bool
- name: concurrent_view_count
dtype: float64
- name: channel_follower_count
dtype: float64
- name: playlist_count
dtype: float64
- name: title_desc_langs
list: string
- name: __index_level_0__
dtype: int64
splits:
- name: train
num_bytes: 222078685
num_examples: 239300
download_size: 115998250
dataset_size: 222078685
- config_name: search_results_cym
features:
- name: type
dtype: large_string
- name: ie_key
dtype: large_string
- name: id
dtype: large_string
- name: url
dtype: large_string
- name: title
dtype: large_string
- name: description
dtype: large_string
- name: duration
dtype: float64
- name: channel_id
dtype: large_string
- name: channel
dtype: large_string
- name: channel_url
dtype: large_string
- name: uploader
dtype: large_string
- name: uploader_id
dtype: large_string
- name: uploader_url
dtype: large_string
- name: thumbnails
list:
- name: height
dtype: int64
- name: url
dtype: string
- name: width
dtype: int64
- name: timestamp
dtype: float64
- name: release_timestamp
dtype: float64
- name: availability
dtype: float64
- name: view_count
dtype: float64
- name: live_status
dtype: large_string
- name: channel_is_verified
dtype: bool
- name: x_forwarded_for_ip
dtype: 'null'
- name: search_keyword
dtype: large_string
- name: is_cc
dtype: bool
- name: channel_follower_count
dtype: float64
- name: playlist_count
dtype: float64
- name: concurrent_view_count
dtype: float64
- name: title_desc_langs
list: string
- name: __index_level_0__
dtype: int64
splits:
- name: train
num_bytes: 32713600
num_examples: 33576
download_size: 17715283
dataset_size: 32713600
- config_name: search_results_gle
features:
- name: type
dtype: large_string
- name: ie_key
dtype: large_string
- name: id
dtype: large_string
- name: url
dtype: large_string
- name: title
dtype: large_string
- name: description
dtype: large_string
- name: duration
dtype: float64
- name: channel_id
dtype: large_string
- name: channel
dtype: large_string
- name: channel_url
dtype: large_string
- name: uploader
dtype: large_string
- name: uploader_id
dtype: large_string
- name: uploader_url
dtype: large_string
- name: thumbnails
list:
- name: height
dtype: int64
- name: url
dtype: string
- name: width
dtype: int64
- name: timestamp
dtype: float64
- name: release_timestamp
dtype: float64
- name: availability
dtype: float64
- name: view_count
dtype: float64
- name: live_status
dtype: large_string
- name: channel_is_verified
dtype: bool
- name: x_forwarded_for_ip
dtype: 'null'
- name: search_keyword
dtype: large_string
- name: is_cc
dtype: bool
- name: channel_follower_count
dtype: float64
- name: playlist_count
dtype: float64
- name: concurrent_view_count
dtype: float64
- name: title_desc_langs
list: string
- name: __index_level_0__
dtype: int64
splits:
- name: train
num_bytes: 76016419
num_examples: 79289
download_size: 39793278
dataset_size: 76016419
- config_name: search_results_gug
features:
- name: type
dtype: large_string
- name: ie_key
dtype: large_string
- name: id
dtype: large_string
- name: url
dtype: large_string
- name: title
dtype: large_string
- name: description
dtype: large_string
- name: duration
dtype: float64
- name: channel_id
dtype: large_string
- name: channel
dtype: large_string
- name: channel_url
dtype: large_string
- name: uploader
dtype: large_string
- name: uploader_id
dtype: large_string
- name: uploader_url
dtype: large_string
- name: thumbnails
list:
- name: height
dtype: int64
- name: url
dtype: string
- name: width
dtype: int64
- name: timestamp
dtype: float64
- name: release_timestamp
dtype: float64
- name: availability
dtype: float64
- name: view_count
dtype: float64
- name: live_status
dtype: large_string
- name: channel_is_verified
dtype: bool
- name: x_forwarded_for_ip
dtype: 'null'
- name: search_keyword
dtype: large_string
- name: is_cc
dtype: bool
- name: channel_follower_count
dtype: float64
- name: playlist_count
dtype: float64
- name: concurrent_view_count
dtype: float64
- name: title_desc_langs
list: string
- name: __index_level_0__
dtype: int64
splits:
- name: train
num_bytes: 220241040
num_examples: 233362
download_size: 111234627
dataset_size: 220241040
- config_name: search_results_uig
features:
- name: type
dtype: large_string
- name: ie_key
dtype: large_string
- name: id
dtype: large_string
- name: url
dtype: large_string
- name: title
dtype: large_string
- name: description
dtype: large_string
- name: duration
dtype: float64
- name: channel_id
dtype: large_string
- name: channel
dtype: large_string
- name: channel_url
dtype: large_string
- name: uploader
dtype: large_string
- name: uploader_id
dtype: large_string
- name: uploader_url
dtype: large_string
- name: thumbnails
list:
- name: height
dtype: int64
- name: url
dtype: string
- name: width
dtype: int64
- name: timestamp
dtype: 'null'
- name: release_timestamp
dtype: float64
- name: availability
dtype: 'null'
- name: view_count
dtype: float64
- name: live_status
dtype: large_string
- name: channel_is_verified
dtype: bool
- name: x_forwarded_for_ip
dtype: 'null'
- name: search_keyword
dtype: large_string
- name: is_cc
dtype: bool
- name: concurrent_view_count
dtype: float64
- name: title_desc_langs
list: string
- name: __index_level_0__
dtype: int64
splits:
- name: train
num_bytes: 11434074
num_examples: 11546
download_size: 5904690
dataset_size: 11434074
- config_name: search_results_xho
features:
- name: type
dtype: large_string
- name: ie_key
dtype: large_string
- name: id
dtype: large_string
- name: url
dtype: large_string
- name: title
dtype: large_string
- name: description
dtype: large_string
- name: duration
dtype: float64
- name: channel_id
dtype: large_string
- name: channel
dtype: large_string
- name: channel_url
dtype: large_string
- name: uploader
dtype: large_string
- name: uploader_id
dtype: large_string
- name: uploader_url
dtype: large_string
- name: thumbnails
list:
- name: height
dtype: int64
- name: url
dtype: string
- name: width
dtype: int64
- name: timestamp
dtype: float64
- name: release_timestamp
dtype: float64
- name: availability
dtype: float64
- name: view_count
dtype: float64
- name: live_status
dtype: large_string
- name: channel_is_verified
dtype: bool
- name: x_forwarded_for_ip
dtype: 'null'
- name: search_keyword
dtype: large_string
- name: is_cc
dtype: bool
- name: channel_follower_count
dtype: float64
- name: playlist_count
dtype: float64
- name: title_desc_langs
list: string
- name: concurrent_view_count
dtype: float64
- name: __index_level_0__
dtype: int64
splits:
- name: train
num_bytes: 41159556
num_examples: 42661
download_size: 21233527
dataset_size: 41159556
configs:
- config_name: all_audio_ban
data_files:
- split: train
path: all_audio_ban/train-*
- config_name: all_audio_cym
data_files:
- split: train
path: all_audio_cym/train-*
- config_name: all_audio_gle
data_files:
- split: train
path: all_audio_gle/train-*
- config_name: all_audio_uig
data_files:
- split: train
path: all_audio_uig/train-*
- config_name: all_audio_xho
data_files:
- split: train
path: all_audio_xho/train-*
- config_name: all_segments_ban
data_files:
- split: train
path: all_segments_ban/train-*
- config_name: all_segments_cym
data_files:
- split: train
path: all_segments_cym/train-*
- config_name: all_segments_gle
data_files:
- split: train
path: all_segments_gle/train-*
- config_name: all_segments_uig
data_files:
- split: train
path: all_segments_uig/train-*
- config_name: all_segments_xho
data_files:
- split: train
path: all_segments_xho/train-*
- config_name: keywords_ban
data_files:
- split: train
path: keywords_ban/train-*
- config_name: keywords_cym
data_files:
- split: train
path: keywords_cym/train-*
- config_name: keywords_gle
data_files:
- split: train
path: keywords_gle/train-*
- config_name: keywords_uig
data_files:
- split: train
path: keywords_uig/train-*
- config_name: keywords_xho
data_files:
- split: train
path: keywords_xho/train-*
- config_name: search_results_ban
data_files:
- split: train
path: search_results_ban/train-*
- config_name: search_results_cym
data_files:
- split: train
path: search_results_cym/train-*
- config_name: search_results_gle
data_files:
- split: train
path: search_results_gle/train-*
- config_name: search_results_gug
data_files:
- split: train
path: search_results_gug/train-*
- config_name: search_results_uig
data_files:
- split: train
path: search_results_uig/train-*
- config_name: search_results_xho
data_files:
- split: train
path: search_results_xho/train-*
---
提供机构:
ptrdvn



