five

ptrdvn/YALD_v0_raw

收藏
Hugging Face · 2026-03-24 更新 · 2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/ptrdvn/YALD_v0_raw
下载链接
链接失效反馈
官方服务:
资源简介:
--- dataset_info: - config_name: all_audio_ban features: - name: id dtype: string - name: audio dtype: audio splits: - name: train num_bytes: 27976323765 num_examples: 13513 download_size: 27871605210 dataset_size: 27976323765 - config_name: all_audio_cym features: - name: id dtype: string - name: audio dtype: audio splits: - name: train num_bytes: 2136430489 num_examples: 1406 download_size: 2107168049 dataset_size: 2136430489 - config_name: all_audio_gle features: - name: id dtype: string - name: audio dtype: audio splits: - name: train num_bytes: 3265050571 num_examples: 1666 download_size: 3358698808 dataset_size: 3265050571 - config_name: all_audio_uig features: - name: id dtype: string - name: audio dtype: audio splits: - name: train num_bytes: 10999210562 num_examples: 3056 download_size: 11010287221 dataset_size: 10999210562 - config_name: all_audio_xho features: - name: id dtype: string - name: audio dtype: audio splits: - name: train num_bytes: 4380535689 num_examples: 2039 download_size: 4303320322 dataset_size: 4380535689 - config_name: all_segments_ban features: - name: id dtype: large_string - name: index dtype: int64 - name: type dtype: large_string - name: ie_key dtype: large_string - name: url dtype: large_string - name: title dtype: large_string - name: description dtype: large_string - name: duration dtype: float64 - name: channel_id dtype: large_string - name: channel dtype: large_string - name: channel_url dtype: large_string - name: uploader dtype: large_string - name: uploader_id dtype: large_string - name: uploader_url dtype: large_string - name: thumbnails list: - name: height dtype: int64 - name: url dtype: string - name: width dtype: int64 - name: timestamp dtype: float64 - name: release_timestamp dtype: float64 - name: availability dtype: float64 - name: view_count dtype: float64 - name: live_status dtype: large_string - name: channel_is_verified dtype: bool - name: x_forwarded_for_ip dtype: 'null' - name: search_keyword dtype: 
large_string - name: is_cc dtype: bool - name: concurrent_view_count dtype: float64 - name: channel_follower_count dtype: float64 - name: playlist_count dtype: float64 - name: title_desc_langs list: string - name: timestamps list: - name: end dtype: float64 - name: start dtype: float64 - name: merged_timestamps list: - name: end dtype: float64 - name: start dtype: float64 - name: split_timestamps list: - name: clip_id dtype: int64 - name: end dtype: float64 - name: start dtype: float64 - name: transcript_omniASR_CTC_1B_v2 dtype: string - name: transcript_omniASR_CTC_300M_v2 dtype: string - name: transcript_omniASR_CTC_3B_v2 dtype: string - name: total_speech_time dtype: float64 - name: perc_speech_time dtype: float64 splits: - name: train num_bytes: 341426050 num_examples: 239300 download_size: 229588857 dataset_size: 341426050 - config_name: all_segments_cym features: - name: id dtype: large_string - name: index dtype: int64 - name: type dtype: large_string - name: ie_key dtype: large_string - name: url dtype: large_string - name: title dtype: large_string - name: description dtype: large_string - name: duration dtype: float64 - name: channel_id dtype: large_string - name: channel dtype: large_string - name: channel_url dtype: large_string - name: uploader dtype: large_string - name: uploader_id dtype: large_string - name: uploader_url dtype: large_string - name: thumbnails list: - name: height dtype: int64 - name: url dtype: string - name: width dtype: int64 - name: timestamp dtype: float64 - name: release_timestamp dtype: float64 - name: availability dtype: float64 - name: view_count dtype: float64 - name: live_status dtype: large_string - name: channel_is_verified dtype: bool - name: x_forwarded_for_ip dtype: 'null' - name: search_keyword dtype: large_string - name: is_cc dtype: bool - name: channel_follower_count dtype: float64 - name: playlist_count dtype: float64 - name: concurrent_view_count dtype: float64 - name: title_desc_langs list: string - name: 
timestamps list: - name: end dtype: float64 - name: start dtype: float64 - name: merged_timestamps list: - name: end dtype: float64 - name: start dtype: float64 - name: split_timestamps list: - name: clip_id dtype: int64 - name: end dtype: float64 - name: start dtype: float64 - name: transcript_omniASR_CTC_1B_v2 dtype: string - name: transcript_omniASR_CTC_300M_v2 dtype: string - name: transcript_omniASR_CTC_3B_v2 dtype: string - name: transcript_omniASR_CTC_7B_v2 dtype: string - name: total_speech_time dtype: float64 - name: perc_speech_time dtype: float64 splits: - name: train num_bytes: 52491482 num_examples: 33576 download_size: 36623217 dataset_size: 52491482 - config_name: all_segments_gle features: - name: id dtype: large_string - name: index dtype: int64 - name: type dtype: large_string - name: ie_key dtype: large_string - name: url dtype: large_string - name: title dtype: large_string - name: description dtype: large_string - name: duration dtype: float64 - name: channel_id dtype: large_string - name: channel dtype: large_string - name: channel_url dtype: large_string - name: uploader dtype: large_string - name: uploader_id dtype: large_string - name: uploader_url dtype: large_string - name: thumbnails list: - name: height dtype: int64 - name: url dtype: string - name: width dtype: int64 - name: timestamp dtype: float64 - name: release_timestamp dtype: float64 - name: availability dtype: float64 - name: view_count dtype: float64 - name: live_status dtype: large_string - name: channel_is_verified dtype: bool - name: x_forwarded_for_ip dtype: 'null' - name: search_keyword dtype: large_string - name: is_cc dtype: bool - name: channel_follower_count dtype: float64 - name: playlist_count dtype: float64 - name: concurrent_view_count dtype: float64 - name: title_desc_langs list: string - name: timestamps list: - name: end dtype: float64 - name: start dtype: float64 - name: merged_timestamps list: - name: end dtype: float64 - name: start dtype: float64 - name: 
split_timestamps list: - name: clip_id dtype: int64 - name: end dtype: float64 - name: start dtype: float64 - name: transcript_omniASR_CTC_1B_v2 dtype: string - name: transcript_omniASR_CTC_300M_v2 dtype: string - name: transcript_omniASR_CTC_3B_v2 dtype: string - name: transcript_omniASR_CTC_7B_v2 dtype: string - name: total_speech_time dtype: float64 - name: perc_speech_time dtype: float64 splits: - name: train num_bytes: 117556177 num_examples: 79289 download_size: 79276714 dataset_size: 117556177 - config_name: all_segments_uig features: - name: id dtype: large_string - name: index dtype: int64 - name: type dtype: large_string - name: ie_key dtype: large_string - name: url dtype: large_string - name: title dtype: large_string - name: description dtype: large_string - name: duration dtype: float64 - name: channel_id dtype: large_string - name: channel dtype: large_string - name: channel_url dtype: large_string - name: uploader dtype: large_string - name: uploader_id dtype: large_string - name: uploader_url dtype: large_string - name: thumbnails list: - name: height dtype: int64 - name: url dtype: string - name: width dtype: int64 - name: timestamp dtype: 'null' - name: release_timestamp dtype: float64 - name: availability dtype: 'null' - name: view_count dtype: float64 - name: live_status dtype: large_string - name: channel_is_verified dtype: bool - name: x_forwarded_for_ip dtype: 'null' - name: search_keyword dtype: large_string - name: is_cc dtype: bool - name: concurrent_view_count dtype: float64 - name: title_desc_langs list: string - name: timestamps list: - name: end dtype: float64 - name: start dtype: float64 - name: merged_timestamps list: - name: end dtype: float64 - name: start dtype: float64 - name: split_timestamps list: - name: clip_id dtype: int64 - name: end dtype: float64 - name: start dtype: float64 - name: transcript_omniASR_CTC_1B_v2 dtype: string - name: transcript_omniASR_CTC_300M_v2 dtype: string - name: transcript_omniASR_CTC_3B_v2 dtype: 
string - name: transcript_omniASR_CTC_7B_v2 dtype: string - name: total_speech_time dtype: float64 - name: perc_speech_time dtype: float64 splits: - name: train num_bytes: 293639069 num_examples: 11546 download_size: 288317364 dataset_size: 293639069 - config_name: all_segments_xho features: - name: id dtype: large_string - name: index dtype: int64 - name: type dtype: large_string - name: ie_key dtype: large_string - name: url dtype: large_string - name: title dtype: large_string - name: description dtype: large_string - name: duration dtype: float64 - name: channel_id dtype: large_string - name: channel dtype: large_string - name: channel_url dtype: large_string - name: uploader dtype: large_string - name: uploader_id dtype: large_string - name: uploader_url dtype: large_string - name: thumbnails list: - name: height dtype: int64 - name: url dtype: string - name: width dtype: int64 - name: timestamp dtype: float64 - name: release_timestamp dtype: float64 - name: availability dtype: float64 - name: view_count dtype: float64 - name: live_status dtype: large_string - name: channel_is_verified dtype: bool - name: x_forwarded_for_ip dtype: 'null' - name: search_keyword dtype: large_string - name: is_cc dtype: bool - name: channel_follower_count dtype: float64 - name: playlist_count dtype: float64 - name: title_desc_langs list: string - name: concurrent_view_count dtype: float64 - name: timestamps list: - name: end dtype: float64 - name: start dtype: float64 - name: merged_timestamps list: - name: end dtype: float64 - name: start dtype: float64 - name: split_timestamps list: - name: clip_id dtype: int64 - name: end dtype: float64 - name: start dtype: float64 - name: transcript_omniASR_CTC_1B_v2 dtype: string - name: transcript_omniASR_CTC_300M_v2 dtype: string - name: transcript_omniASR_CTC_3B_v2 dtype: string - name: transcript_omniASR_CTC_7B_v2 dtype: string - name: total_speech_time dtype: float64 - name: perc_speech_time dtype: float64 splits: - name: train 
num_bytes: 96178037 num_examples: 42661 download_size: 75216325 dataset_size: 96178037 - config_name: keywords_ban features: - name: keyword dtype: large_string - name: lang_name dtype: large_string splits: - name: train num_bytes: 26872 num_examples: 541 download_size: 10220 dataset_size: 26872 - config_name: keywords_cym features: - name: keyword dtype: large_string - name: lang_name dtype: large_string splits: - name: train num_bytes: 22601 num_examples: 529 download_size: 9674 dataset_size: 22601 - config_name: keywords_gle features: - name: keyword dtype: large_string - name: lang_name dtype: large_string splits: - name: train num_bytes: 26244 num_examples: 529 download_size: 11507 dataset_size: 26244 - config_name: keywords_uig features: - name: keyword dtype: large_string - name: lang_name dtype: large_string splits: - name: train num_bytes: 32291 num_examples: 539 download_size: 12279 dataset_size: 32291 - config_name: keywords_xho features: - name: keyword dtype: large_string - name: lang_name dtype: large_string splits: - name: train num_bytes: 24148 num_examples: 530 download_size: 9648 dataset_size: 24148 - config_name: search_results_ban features: - name: type dtype: large_string - name: ie_key dtype: large_string - name: id dtype: large_string - name: url dtype: large_string - name: title dtype: large_string - name: description dtype: large_string - name: duration dtype: float64 - name: channel_id dtype: large_string - name: channel dtype: large_string - name: channel_url dtype: large_string - name: uploader dtype: large_string - name: uploader_id dtype: large_string - name: uploader_url dtype: large_string - name: thumbnails list: - name: height dtype: int64 - name: url dtype: string - name: width dtype: int64 - name: timestamp dtype: float64 - name: release_timestamp dtype: float64 - name: availability dtype: float64 - name: view_count dtype: float64 - name: live_status dtype: large_string - name: channel_is_verified dtype: bool - name: 
x_forwarded_for_ip dtype: 'null' - name: search_keyword dtype: large_string - name: is_cc dtype: bool - name: concurrent_view_count dtype: float64 - name: channel_follower_count dtype: float64 - name: playlist_count dtype: float64 - name: title_desc_langs list: string - name: __index_level_0__ dtype: int64 splits: - name: train num_bytes: 222078685 num_examples: 239300 download_size: 115998250 dataset_size: 222078685 - config_name: search_results_cym features: - name: type dtype: large_string - name: ie_key dtype: large_string - name: id dtype: large_string - name: url dtype: large_string - name: title dtype: large_string - name: description dtype: large_string - name: duration dtype: float64 - name: channel_id dtype: large_string - name: channel dtype: large_string - name: channel_url dtype: large_string - name: uploader dtype: large_string - name: uploader_id dtype: large_string - name: uploader_url dtype: large_string - name: thumbnails list: - name: height dtype: int64 - name: url dtype: string - name: width dtype: int64 - name: timestamp dtype: float64 - name: release_timestamp dtype: float64 - name: availability dtype: float64 - name: view_count dtype: float64 - name: live_status dtype: large_string - name: channel_is_verified dtype: bool - name: x_forwarded_for_ip dtype: 'null' - name: search_keyword dtype: large_string - name: is_cc dtype: bool - name: channel_follower_count dtype: float64 - name: playlist_count dtype: float64 - name: concurrent_view_count dtype: float64 - name: title_desc_langs list: string - name: __index_level_0__ dtype: int64 splits: - name: train num_bytes: 32713600 num_examples: 33576 download_size: 17715283 dataset_size: 32713600 - config_name: search_results_gle features: - name: type dtype: large_string - name: ie_key dtype: large_string - name: id dtype: large_string - name: url dtype: large_string - name: title dtype: large_string - name: description dtype: large_string - name: duration dtype: float64 - name: channel_id dtype: 
large_string - name: channel dtype: large_string - name: channel_url dtype: large_string - name: uploader dtype: large_string - name: uploader_id dtype: large_string - name: uploader_url dtype: large_string - name: thumbnails list: - name: height dtype: int64 - name: url dtype: string - name: width dtype: int64 - name: timestamp dtype: float64 - name: release_timestamp dtype: float64 - name: availability dtype: float64 - name: view_count dtype: float64 - name: live_status dtype: large_string - name: channel_is_verified dtype: bool - name: x_forwarded_for_ip dtype: 'null' - name: search_keyword dtype: large_string - name: is_cc dtype: bool - name: channel_follower_count dtype: float64 - name: playlist_count dtype: float64 - name: concurrent_view_count dtype: float64 - name: title_desc_langs list: string - name: __index_level_0__ dtype: int64 splits: - name: train num_bytes: 76016419 num_examples: 79289 download_size: 39793278 dataset_size: 76016419 - config_name: search_results_gug features: - name: type dtype: large_string - name: ie_key dtype: large_string - name: id dtype: large_string - name: url dtype: large_string - name: title dtype: large_string - name: description dtype: large_string - name: duration dtype: float64 - name: channel_id dtype: large_string - name: channel dtype: large_string - name: channel_url dtype: large_string - name: uploader dtype: large_string - name: uploader_id dtype: large_string - name: uploader_url dtype: large_string - name: thumbnails list: - name: height dtype: int64 - name: url dtype: string - name: width dtype: int64 - name: timestamp dtype: float64 - name: release_timestamp dtype: float64 - name: availability dtype: float64 - name: view_count dtype: float64 - name: live_status dtype: large_string - name: channel_is_verified dtype: bool - name: x_forwarded_for_ip dtype: 'null' - name: search_keyword dtype: large_string - name: is_cc dtype: bool - name: channel_follower_count dtype: float64 - name: playlist_count dtype: float64 
- name: concurrent_view_count dtype: float64 - name: title_desc_langs list: string - name: __index_level_0__ dtype: int64 splits: - name: train num_bytes: 220241040 num_examples: 233362 download_size: 111234627 dataset_size: 220241040 - config_name: search_results_uig features: - name: type dtype: large_string - name: ie_key dtype: large_string - name: id dtype: large_string - name: url dtype: large_string - name: title dtype: large_string - name: description dtype: large_string - name: duration dtype: float64 - name: channel_id dtype: large_string - name: channel dtype: large_string - name: channel_url dtype: large_string - name: uploader dtype: large_string - name: uploader_id dtype: large_string - name: uploader_url dtype: large_string - name: thumbnails list: - name: height dtype: int64 - name: url dtype: string - name: width dtype: int64 - name: timestamp dtype: 'null' - name: release_timestamp dtype: float64 - name: availability dtype: 'null' - name: view_count dtype: float64 - name: live_status dtype: large_string - name: channel_is_verified dtype: bool - name: x_forwarded_for_ip dtype: 'null' - name: search_keyword dtype: large_string - name: is_cc dtype: bool - name: concurrent_view_count dtype: float64 - name: title_desc_langs list: string - name: __index_level_0__ dtype: int64 splits: - name: train num_bytes: 11434074 num_examples: 11546 download_size: 5904690 dataset_size: 11434074 - config_name: search_results_xho features: - name: type dtype: large_string - name: ie_key dtype: large_string - name: id dtype: large_string - name: url dtype: large_string - name: title dtype: large_string - name: description dtype: large_string - name: duration dtype: float64 - name: channel_id dtype: large_string - name: channel dtype: large_string - name: channel_url dtype: large_string - name: uploader dtype: large_string - name: uploader_id dtype: large_string - name: uploader_url dtype: large_string - name: thumbnails list: - name: height dtype: int64 - name: url 
dtype: string - name: width dtype: int64 - name: timestamp dtype: float64 - name: release_timestamp dtype: float64 - name: availability dtype: float64 - name: view_count dtype: float64 - name: live_status dtype: large_string - name: channel_is_verified dtype: bool - name: x_forwarded_for_ip dtype: 'null' - name: search_keyword dtype: large_string - name: is_cc dtype: bool - name: channel_follower_count dtype: float64 - name: playlist_count dtype: float64 - name: title_desc_langs list: string - name: concurrent_view_count dtype: float64 - name: __index_level_0__ dtype: int64 splits: - name: train num_bytes: 41159556 num_examples: 42661 download_size: 21233527 dataset_size: 41159556 configs: - config_name: all_audio_ban data_files: - split: train path: all_audio_ban/train-* - config_name: all_audio_cym data_files: - split: train path: all_audio_cym/train-* - config_name: all_audio_gle data_files: - split: train path: all_audio_gle/train-* - config_name: all_audio_uig data_files: - split: train path: all_audio_uig/train-* - config_name: all_audio_xho data_files: - split: train path: all_audio_xho/train-* - config_name: all_segments_ban data_files: - split: train path: all_segments_ban/train-* - config_name: all_segments_cym data_files: - split: train path: all_segments_cym/train-* - config_name: all_segments_gle data_files: - split: train path: all_segments_gle/train-* - config_name: all_segments_uig data_files: - split: train path: all_segments_uig/train-* - config_name: all_segments_xho data_files: - split: train path: all_segments_xho/train-* - config_name: keywords_ban data_files: - split: train path: keywords_ban/train-* - config_name: keywords_cym data_files: - split: train path: keywords_cym/train-* - config_name: keywords_gle data_files: - split: train path: keywords_gle/train-* - config_name: keywords_uig data_files: - split: train path: keywords_uig/train-* - config_name: keywords_xho data_files: - split: train path: keywords_xho/train-* - config_name: 
search_results_ban data_files: - split: train path: search_results_ban/train-* - config_name: search_results_cym data_files: - split: train path: search_results_cym/train-* - config_name: search_results_gle data_files: - split: train path: search_results_gle/train-* - config_name: search_results_gug data_files: - split: train path: search_results_gug/train-* - config_name: search_results_uig data_files: - split: train path: search_results_uig/train-* - config_name: search_results_xho data_files: - split: train path: search_results_xho/train-* ---
提供机构:
ptrdvn
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作