mteb/EmotionAnalysis
收藏Hugging Face2026-04-19 更新2026-04-26 收录
下载链接:
https://hf-mirror.com/datasets/mteb/EmotionAnalysis
下载链接
链接失效反馈官方服务:
资源简介:
---
annotations_creators:
- human-annotated
language:
- afr
- amh
- arq
- ary
- cdo
- deu
- eng
- gaz
- hau
- hin
- ibo
- ind
- jav
- kin
- mar
- pcm
- ron
- rus
- som
- spa
- sun
- swa
- swe
- tat
- tir
- ukr
- vmw
- xho
- yor
- zul
license: cc-by-4.0
multilinguality: multilingual
source_datasets:
- llama-lang-adapt/EmotionAnalysisFinal
task_categories:
- text-classification
task_ids:
- sentiment-classification
dataset_info:
- config_name: afr
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 108045.0
num_examples: 1065
- name: train
num_bytes: 10095.0
num_examples: 98
download_size: 78096
dataset_size: 118140.0
- config_name: amh
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 469331.2807215333
num_examples: 1772
- name: train
num_bytes: 159424.0
num_examples: 592
download_size: 341676
dataset_size: 628755.2807215333
- config_name: arq
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 144068.1019955654
num_examples: 901
- name: train
num_bytes: 16253.0
num_examples: 100
download_size: 81131
dataset_size: 160321.1019955654
- config_name: ary
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 116400.35467980296
num_examples: 809
- name: train
num_bytes: 37747.0
num_examples: 267
download_size: 94836
dataset_size: 154147.35467980296
- config_name: chn
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 16913.339894019682
num_examples: 92
- name: train
num_bytes: 883.725
num_examples: 5
download_size: 16739
dataset_size: 17797.06489401968
- config_name: deu
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 636845.2258064516
num_examples: 2598
- name: train
num_bytes: 48760.0
num_examples: 200
download_size: 447925
dataset_size: 685605.2258064516
- config_name: eng
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 275236.7705095772
num_examples: 2754
- name: train
num_bytes: 11081.637931034482
num_examples: 115
download_size: 174757
dataset_size: 286318.4084406117
- config_name: esp
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 117654.0
num_examples: 1695
- name: train
num_bytes: 14366.0
num_examples: 184
download_size: 83426
dataset_size: 132020.0
- config_name: gaz
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 279017.94770482276
num_examples: 1691
- name: train
num_bytes: 97310.73519163764
num_examples: 571
download_size: 254657
dataset_size: 376328.6828964604
- config_name: hau
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 102892.0
num_examples: 1080
- name: train
num_bytes: 34146.0
num_examples: 356
download_size: 91729
dataset_size: 137038.0
- config_name: hin
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 217653.0
num_examples: 1010
- name: train
num_bytes: 21462.0
num_examples: 100
download_size: 106776
dataset_size: 239115.0
- config_name: ibo
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 115990.25831024931
num_examples: 1411
- name: train
num_bytes: 39216.95615866388
num_examples: 478
download_size: 88887
dataset_size: 155207.21446891318
- config_name: ind
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 87773.0
num_examples: 851
- name: train
num_bytes: 15285.0
num_examples: 156
download_size: 67219
dataset_size: 103058.0
- config_name: jav
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 93396.0
num_examples: 837
- name: train
num_bytes: 16121.0
num_examples: 151
download_size: 73615
dataset_size: 109517.0
- config_name: kin
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 183324.3476848091
num_examples: 1226
- name: train
num_bytes: 61004.74201474201
num_examples: 405
download_size: 160281
dataset_size: 244329.0896995511
- config_name: mar
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 234673.0
num_examples: 1000
- name: train
num_bytes: 21003.0
num_examples: 100
download_size: 105651
dataset_size: 255676.0
- config_name: pcm
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 245948.0
num_examples: 1870
- name: train
num_bytes: 85256.0
num_examples: 620
download_size: 207816
dataset_size: 331204.0
- config_name: ron
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 140555.2797140304
num_examples: 1118
- name: train
num_bytes: 15305.0
num_examples: 123
download_size: 103058
dataset_size: 155860.2797140304
- config_name: rus
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 118911.002
num_examples: 947
- name: train
num_bytes: 22367.48743718593
num_examples: 190
download_size: 85267
dataset_size: 141278.48943718593
- config_name: som
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 289372.23172169813
num_examples: 1693
- name: train
num_bytes: 95307.8445229682
num_examples: 560
download_size: 261147
dataset_size: 384680.07624466636
- config_name: sun
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 88659.0
num_examples: 926
- name: train
num_bytes: 19124.0
num_examples: 199
download_size: 70761
dataset_size: 107783.0
- config_name: swe
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 239988.18686868687
num_examples: 1149
- name: train
num_bytes: 38948.365
num_examples: 193
download_size: 182457
dataset_size: 278936.55186868686
- config_name: swh
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 153837.1497584541
num_examples: 1640
- name: train
num_bytes: 51577.10526315789
num_examples: 549
download_size: 145852
dataset_size: 205414.25502161199
- config_name: tat
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 118466.28
num_examples: 958
- name: train
num_bytes: 23679.825
num_examples: 195
download_size: 85339
dataset_size: 142146.105
- config_name: tir
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 467038.79673913046
num_examples: 1838
- name: train
num_bytes: 159966.0
num_examples: 614
download_size: 348855
dataset_size: 627004.7967391305
- config_name: ukr
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 259638.72649955237
num_examples: 2233
- name: train
num_bytes: 29586.0
num_examples: 249
download_size: 170714
dataset_size: 289224.7264995524
- config_name: vmw
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 73160.0
num_examples: 777
- name: train
num_bytes: 23938.0
num_examples: 258
download_size: 66824
dataset_size: 97098.0
- config_name: xho
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 104948.0
num_examples: 1594
- name: train
num_bytes: 44368.0
num_examples: 682
download_size: 95386
dataset_size: 149316.0
- config_name: yor
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 170003.0
num_examples: 1500
- name: train
num_bytes: 54902.0
num_examples: 497
download_size: 144492
dataset_size: 224905.0
- config_name: zul
features:
- name: text
dtype: string
- name: label
sequence: int64
splits:
- name: validation
num_bytes: 0
num_examples: 0
- name: test
num_bytes: 135132.3106985833
num_examples: 2040
- name: train
num_bytes: 58290.45942857143
num_examples: 873
download_size: 136624
dataset_size: 193422.7701271547
configs:
- config_name: afr
data_files:
- split: validation
path: afr/validation-*
- split: test
path: afr/test-*
- split: train
path: afr/train-*
- config_name: amh
data_files:
- split: validation
path: amh/validation-*
- split: test
path: amh/test-*
- split: train
path: amh/train-*
- config_name: arq
data_files:
- split: validation
path: arq/validation-*
- split: test
path: arq/test-*
- split: train
path: arq/train-*
- config_name: ary
data_files:
- split: validation
path: ary/validation-*
- split: test
path: ary/test-*
- split: train
path: ary/train-*
- config_name: chn
data_files:
- split: validation
path: chn/validation-*
- split: test
path: chn/test-*
- split: train
path: chn/train-*
- config_name: deu
data_files:
- split: validation
path: deu/validation-*
- split: test
path: deu/test-*
- split: train
path: deu/train-*
- config_name: eng
data_files:
- split: validation
path: eng/validation-*
- split: test
path: eng/test-*
- split: train
path: eng/train-*
- config_name: esp
data_files:
- split: validation
path: esp/validation-*
- split: test
path: esp/test-*
- split: train
path: esp/train-*
- config_name: gaz
data_files:
- split: validation
path: gaz/validation-*
- split: test
path: gaz/test-*
- split: train
path: gaz/train-*
- config_name: hau
data_files:
- split: validation
path: hau/validation-*
- split: test
path: hau/test-*
- split: train
path: hau/train-*
- config_name: hin
data_files:
- split: validation
path: hin/validation-*
- split: test
path: hin/test-*
- split: train
path: hin/train-*
- config_name: ibo
data_files:
- split: validation
path: ibo/validation-*
- split: test
path: ibo/test-*
- split: train
path: ibo/train-*
- config_name: ind
data_files:
- split: validation
path: ind/validation-*
- split: test
path: ind/test-*
- split: train
path: ind/train-*
- config_name: jav
data_files:
- split: validation
path: jav/validation-*
- split: test
path: jav/test-*
- split: train
path: jav/train-*
- config_name: kin
data_files:
- split: validation
path: kin/validation-*
- split: test
path: kin/test-*
- split: train
path: kin/train-*
- config_name: mar
data_files:
- split: validation
path: mar/validation-*
- split: test
path: mar/test-*
- split: train
path: mar/train-*
- config_name: pcm
data_files:
- split: validation
path: pcm/validation-*
- split: test
path: pcm/test-*
- split: train
path: pcm/train-*
- config_name: ron
data_files:
- split: validation
path: ron/validation-*
- split: test
path: ron/test-*
- split: train
path: ron/train-*
- config_name: rus
data_files:
- split: validation
path: rus/validation-*
- split: test
path: rus/test-*
- split: train
path: rus/train-*
- config_name: som
data_files:
- split: validation
path: som/validation-*
- split: test
path: som/test-*
- split: train
path: som/train-*
- config_name: sun
data_files:
- split: validation
path: sun/validation-*
- split: test
path: sun/test-*
- split: train
path: sun/train-*
- config_name: swe
data_files:
- split: validation
path: swe/validation-*
- split: test
path: swe/test-*
- split: train
path: swe/train-*
- config_name: swh
data_files:
- split: validation
path: swh/validation-*
- split: test
path: swh/test-*
- split: train
path: swh/train-*
- config_name: tat
data_files:
- split: validation
path: tat/validation-*
- split: test
path: tat/test-*
- split: train
path: tat/train-*
- config_name: tir
data_files:
- split: validation
path: tir/validation-*
- split: test
path: tir/test-*
- split: train
path: tir/train-*
- config_name: ukr
data_files:
- split: validation
path: ukr/validation-*
- split: test
path: ukr/test-*
- split: train
path: ukr/train-*
- config_name: vmw
data_files:
- split: validation
path: vmw/validation-*
- split: test
path: vmw/test-*
- split: train
path: vmw/train-*
- config_name: xho
data_files:
- split: validation
path: xho/validation-*
- split: test
path: xho/test-*
- split: train
path: xho/train-*
- config_name: yor
data_files:
- split: validation
path: yor/validation-*
- split: test
path: yor/test-*
- split: train
path: yor/train-*
- config_name: zul
data_files:
- split: validation
path: zul/validation-*
- split: test
path: zul/test-*
- split: train
path: zul/train-*
tags:
- mteb
- text
---
<!-- adapted from https://github.com/huggingface/huggingface_hub/blob/v0.30.2/src/huggingface_hub/templates/datasetcard_template.md -->
<div align="center" style="padding: 40px 20px; background-color: white; border-radius: 12px; box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05); max-width: 600px; margin: 0 auto;">
<h1 style="font-size: 3.5rem; color: #1a1a1a; margin: 0 0 20px 0; letter-spacing: 2px; font-weight: 700;">EmotionAnalysisPlus</h1>
<div style="font-size: 1.5rem; color: #4a4a4a; margin-bottom: 5px; font-weight: 300;">An <a href="https://github.com/embeddings-benchmark/mteb" style="color: #2c5282; font-weight: 600; text-decoration: none;" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">MTEB</a> dataset</div>
<div style="font-size: 0.9rem; color: #2c5282; margin-top: 10px;">Massive Text Embedding Benchmark</div>
</div>
Multi-label emotion classification dataset for 28 languages released with the BRIGHTER project and SemEval-2025 Task 11.
| | |
|---------------|---------------------------------------------|
| Task category | t2c |
| Domains | Social, Written |
| Reference | https://github.com/emotion-analysis-project/SemEval2025-Task11 |
Source datasets:
- [llama-lang-adapt/EmotionAnalysisFinal](https://huggingface.co/datasets/llama-lang-adapt/EmotionAnalysisFinal)
## Dataset Preparation in MTEB
This repository is a staging copy of `llama-lang-adapt/EmotionAnalysisFinal` for MTEB. The intended long-term canonical benchmark copy is `mteb/EmotionAnalysis`.
### Transformations
- Converted per-emotion indicator columns into the MTEB multi-label format: `label: list[int]`
- Preserved the task-facing subset names, including `gaz` and `swh`, while sourcing from the original Hub configs
- Backfilled a `train` split where missing for benchmark compatibility before cleaning
- Applied dataset cleaning before upload to reduce duplicates and copied-train overlap in the staging copy
### Label Schema
- `0`: anger
- `1`: disgust
- `2`: fear
- `3`: joy
- `4`: sadness
- `5`: surprise
### Splits and subsets
- Language-specific configs from the benchmark task are preserved
- The staged copy includes the cleaned train/validation/test-style splits used by `EmotionAnalysisPlus`
## How to evaluate on this task
You can evaluate an embedding model on this dataset using the following code:
```python
import mteb
task = mteb.get_task("EmotionAnalysisPlus")
evaluator = mteb.MTEB([task])
model = mteb.get_model(YOUR_MODEL)
evaluator.run(model)
```
<!-- Datasets want link to arxiv in readme to autolink dataset with paper -->
To learn more about how to run models on `mteb` tasks, check out the [GitHub repository](https://github.com/embeddings-benchmark/mteb).
## Citation
If you use this dataset, please cite the dataset as well as [mteb](https://github.com/embeddings-benchmark/mteb), as this dataset likely includes additional processing as a part of the [MMTEB Contribution](https://github.com/embeddings-benchmark/mteb/tree/main/docs/mmteb).
```bibtex
@article{enevoldsen2025mmtebmassivemultilingualtext,
title={MMTEB: Massive Multilingual Text Embedding Benchmark},
author={Kenneth Enevoldsen and Isaac Chung and Imene Kerboua and Márton Kardos and Ashwin Mathur and David Stap and Jay Gala and Wissam Siblini and Dominik Krzemiński and Genta Indra Winata and Saba Sturua and Saiteja Utpala and Mathieu Ciancone and Marion Schaeffer and Gabriel Sequeira and Diganta Misra and Shreeya Dhakal and Jonathan Rystrøm and Roman Solomatin and Ömer Çağatan and Akash Kundu and Martin Bernstorff and Shitao Xiao and Akshita Sukhlecha and Bhavish Pahwa and Rafał Poświata and Kranthi Kiran GV and Shawon Ashraf and Daniel Auras and Björn Plüster and Jan Philipp Harries and Loïc Magne and Isabelle Mohr and Mariya Hendriksen and Dawei Zhu and Hippolyte Gisserot-Boukhlef and Tom Aarsen and Jan Kostkan and Konrad Wojtasik and Taemin Lee and Marek Šuppa and Crystina Zhang and Roberta Rocca and Mohammed Hamdy and Andrianos Michail and John Yang and Manuel Faysse and Aleksei Vatolin and Nandan Thakur and Manan Dey and Dipam Vasani and Pranjal Chitale and Simone Tedeschi and Nguyen Tai and Artem Snegirev and Michael Günther and Mengzhou Xia and Weijia Shi and Xing Han Lù and Jordan Clive and Gayatri Krishnakumar and Anna Maksimova and Silvan Wehrli and Maria Tikhonova and Henil Panchal and Aleksandr Abramov and Malte Ostendorff and Zheng Liu and Simon Clematide and Lester James Miranda and Alena Fenogenova and Guangyu Song and Ruqiya Bin Safi and Wen-Ding Li and Alessia Borghini and Federico Cassano and Hongjin Su and Jimmy Lin and Howard Yen and Lasse Hansen and Sara Hooker and Chenghao Xiao and Vaibhav Adlakha and Orion Weller and Siva Reddy and Niklas Muennighoff},
publisher = {arXiv},
journal={arXiv preprint arXiv:2502.13595},
year={2025},
url={https://arxiv.org/abs/2502.13595},
doi = {10.48550/arXiv.2502.13595},
}
@article{muennighoff2022mteb,
author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Loïc and Reimers, Nils},
title = {MTEB: Massive Text Embedding Benchmark},
publisher = {arXiv},
journal={arXiv preprint arXiv:2210.07316},
year = {2022},
url = {https://arxiv.org/abs/2210.07316},
doi = {10.48550/ARXIV.2210.07316},
}
```
# Dataset Statistics
<details>
<summary> Dataset Statistics</summary>
The following code contains the descriptive statistics from the task. These can also be obtained using:
```python
import mteb
task = mteb.get_task("EmotionAnalysisPlus")
desc_stats = task.metadata.descriptive_stats
```
```json
{
"validation": {
"num_samples": 9913,
"number_texts_intersect_with_train": 9912,
"text_statistics": {
"total_text_length": 913579,
"min_text_length": 6,
"average_text_length": 92.15968929688287,
"max_text_length": 2022,
"unique_texts": 9912
},
"image_statistics": null,
"audio_statistics": null,
"label_statistics": {
"min_labels_per_text": 0,
"average_label_per_text": 0.9135478664380107,
"max_labels_per_text": 5,
"unique_labels": 7,
"labels": {
"3": {
"count": 1956
},
"None": {
"count": 2679
},
"2": {
"count": 721
},
"0": {
"count": 1509
},
"1": {
"count": 1584
},
"4": {
"count": 2091
},
"5": {
"count": 1195
}
}
}
},
"test": {
"num_samples": 43882,
"number_texts_intersect_with_train": 17,
"text_statistics": {
"total_text_length": 4126105,
"min_text_length": 3,
"average_text_length": 94.02727769928444,
"max_text_length": 3779,
"unique_texts": 43848
},
"image_statistics": null,
"audio_statistics": null,
"label_statistics": {
"min_labels_per_text": 0,
"average_label_per_text": 1.0114169819060206,
"max_labels_per_text": 6,
"unique_labels": 7,
"labels": {
"3": {
"count": 9680
},
"None": {
"count": 10566
},
"4": {
"count": 9202
},
"2": {
"count": 4844
},
"0": {
"count": 7680
},
"1": {
"count": 6957
},
"5": {
"count": 6020
}
}
}
},
"train": {
"num_samples": 9913,
"number_texts_intersect_with_train": null,
"text_statistics": {
"total_text_length": 913579,
"min_text_length": 6,
"average_text_length": 92.15968929688287,
"max_text_length": 2022,
"unique_texts": 9912
},
"image_statistics": null,
"audio_statistics": null,
"label_statistics": {
"min_labels_per_text": 0,
"average_label_per_text": 0.9135478664380107,
"max_labels_per_text": 5,
"unique_labels": 7,
"labels": {
"3": {
"count": 1956
},
"None": {
"count": 2679
},
"2": {
"count": 721
},
"0": {
"count": 1509
},
"1": {
"count": 1584
},
"4": {
"count": 2091
},
"5": {
"count": 1195
}
}
}
}
}
```
</details>
---
*This dataset card was automatically generated using [MTEB](https://github.com/embeddings-benchmark/mteb)*
提供机构:
mteb



