Skip to content

Translation

labl.datasets.translation.load_qe4pe

load_qe4pe(
    configs: Qe4peTask | list[Qe4peTask] = "main",
    langs: Qe4peLanguage | list[Qe4peLanguage] = [
        "ita",
        "nld",
    ],
    domains: Qe4peDomain | list[Qe4peDomain] | None = None,
    speed_groups: Qe4peSpeedGroup
    | list[Qe4peSpeedGroup]
    | None = None,
    highlight_modalities: Qe4peHighlightModality
    | list[Qe4peHighlightModality]
    | None = None,
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict[str, Any] = {},
    filter_issues: bool = True,
    with_gaps: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> dict[str, dict[str, EditedDataset]]

Load the QE4PE dataset by Sarti et al. (2025), containing multiple edits over a single set of machine-translated sentences in two languages (Italian and Dutch).

Parameters:

Name Type Description Default
configs Literal["pretask", "main", "posttask"] | list[Literal["pretask", "main", "posttask"]], *optional*

One or more task configurations to load. Defaults to "main". Available options: "pretask", "main", "posttask".

'main'
langs Literal["ita", "nld"] | list[Literal["ita", "nld"]], *optional*

One or more languages to load. Defaults to ["ita", "nld"]. Available options: "ita", "nld".

['ita', 'nld']
domains Literal["biomedical", "social"] | list[Literal["biomedical", "social"]] | None, *optional*

One or more text categories to load. Defaults to ["biomedical", "social"]. Available options: "biomedical", "social".

None
speed_groups Literal["faster", "avg", "slower"] | list[Literal["faster", "avg", "slower"]] | None, *optional*

One or more translator speed groups to load. Defaults to ["faster", "avg", "slower"]. Available options: "faster", "avg", "slower".

None
highlight_modalities Literal["no_highlight", "oracle", "supervised", "unsupervised"] | list[Literal["no_highlight", "oracle", "supervised", "unsupervised"]] | None, *optional*

One or more highlight modalities to load. Defaults to all modalities. Available options: "no_highlight", "oracle", "supervised", "unsupervised".

None
filter_issues bool, *optional*

Whether to filter out issues from the dataset. Defaults to True.

True
tokenizer str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast, *optional*

The tokenizer to use for tokenization. If None, a default whitespace tokenizer will be used.

None
tokenizer_kwargs dict[str, Any], *optional*

Additional arguments for the tokenizer.

{}
with_gaps bool, *optional*

Whether to include gaps in the tokenization. Defaults to True.

True
sub_label str, *optional*

The label for substitutions. Defaults to "S".

'S'
ins_label str, *optional*

The label for insertions. Defaults to "I".

'I'
del_label str, *optional*

The label for deletions. Defaults to "D".

'D'
gap_token str, *optional*

The token used for gaps. Defaults to "▁".

'▁'

Returns:

Type Description
dict[str, dict[str, EditedDataset]]

A dictionary containing the loaded datasets for each task and language. The keys are the task configurations, and the values are dictionaries with language keys and EditedDataset objects as values. E.g. load_qe4pe()["main"]["ita"] returns the EditedDataset for the main task for Italian.

Source code in labl/datasets/translation/qe4pe.py
def load_qe4pe(
    configs: Qe4peTask | list[Qe4peTask] = "main",
    langs: Qe4peLanguage | list[Qe4peLanguage] = ["ita", "nld"],
    domains: Qe4peDomain | list[Qe4peDomain] | None = None,
    speed_groups: Qe4peSpeedGroup | list[Qe4peSpeedGroup] | None = None,
    highlight_modalities: Qe4peHighlightModality | list[Qe4peHighlightModality] | None = None,
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict[str, Any] = {},
    filter_issues: bool = True,
    with_gaps: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> dict[str, dict[str, EditedDataset]]:
    """Load the QE4PE dataset by [Sarti et al. (2025)](https://arxiv.org/abs/2503.03044), containing multiple edits
        over a single set of machine-translated sentences in two languages (Italian and Dutch).

    The `domains`, `speed_groups` and `highlight_modalities` filters are applied jointly: an entry is kept only
    if it satisfies all of them.

    Args:
        configs (Literal["pretask", "main", "posttask"] | list[Literal["pretask", "main", "posttask"]], *optional*):
            One or more task configurations to load. Defaults to "main".
            Available options: "pretask", "main", "posttask".
        langs (Literal["ita", "nld"] | list[Literal["ita", "nld"]], *optional*):
            One or more languages to load. Defaults to ["ita", "nld"].
            Available options: "ita", "nld".
        domains (Literal["biomedical", "social"] | list[Literal["biomedical", "social"]] | None, *optional*):
            One or more text categories to load. Defaults to ["biomedical", "social"].
            Available options: "biomedical", "social".
        speed_groups (Literal["faster", "avg", "slower"] | list[Literal["faster", "avg", "slower"]] | None, *optional*):
            One or more translator speed groups to load. Defaults to ["faster", "avg", "slower"].
            Available options: "faster", "avg", "slower". Groups are matched against the suffix of the
            `translator_main_id` column through `SPEED_MAP`.
        highlight_modalities (Literal["no_highlight", "oracle", "supervised", "unsupervised"] | list[Literal["no_highlight", "oracle", "supervised", "unsupervised"]] | None, *optional*):
            One or more highlight modalities to load. Defaults to all modalities.
            Available options: "no_highlight", "oracle", "supervised", "unsupervised".
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast, *optional*):
            The tokenizer to use for tokenization. If None, a default whitespace tokenizer will be used.
        tokenizer_kwargs (dict[str, Any], *optional*):
            Additional arguments for the tokenizer.
        filter_issues (bool, *optional*):
            Whether to filter out issues from the dataset, i.e. rows flagged by the `has_issue` column and all
            rows of translator `no_highlight_t4`. Defaults to True.
        with_gaps (bool, *optional*):
            Whether to include gaps in the tokenization. Defaults to True.
        sub_label (str, *optional*):
            The label for substitutions. Defaults to "S".
        ins_label (str, *optional*):
            The label for insertions. Defaults to "I".
        del_label (str, *optional*):
            The label for deletions. Defaults to "D".
        gap_token (str, *optional*):
            The token used for gaps. Defaults to "▁".

    Returns:
        A dictionary containing the loaded datasets for each task and language.
            The keys are the task configurations, and the values are dictionaries with language keys
            and `EditedDataset` objects as values. E.g. `load_qe4pe()["main"]["ita"]` returns the
            `EditedDataset` for the main task for Italian.

    Raises:
        RuntimeError: If the `datasets` or `pandas` library is not installed.
    """
    # Report exactly which optional dependency is missing instead of always blaming `datasets`.
    missing = [
        name
        for name, available in (("datasets", is_datasets_available()), ("pandas", is_pandas_available()))
        if not available
    ]
    if missing:
        raise RuntimeError(
            f"The following required libraries are not installed: {', '.join(missing)}. "
            "Please install them to use this function."
        )
    import pandas as pd

    from datasets import DatasetDict, load_dataset

    # Normalize every selector to a list; `None` means "all available values".
    if isinstance(configs, str):
        configs = [configs]
    if isinstance(langs, str):
        langs = [langs]
    if domains is None:
        domains = ["biomedical", "social"]
    if isinstance(domains, str):
        domains = [domains]
    if speed_groups is None:
        speed_groups = ["faster", "avg", "slower"]
    if isinstance(speed_groups, str):
        speed_groups = [speed_groups]
    if highlight_modalities is None:
        highlight_modalities = ["no_highlight", "oracle", "supervised", "unsupervised"]
    if isinstance(highlight_modalities, str):
        highlight_modalities = [highlight_modalities]

    # Loop-invariant: suffixes of `translator_main_id` identifying the requested speed groups.
    speed_suffixes = tuple(SPEED_MAP[g] for g in speed_groups)

    out_dict: dict[str, dict[str, EditedDataset]] = {}
    for config in configs:
        dataset = cast(DatasetDict, load_dataset("gsarti/qe4pe", config))
        df = cast(pd.DataFrame, dataset["train"].to_pandas())
        if filter_issues:
            df = df[(~df["has_issue"]) & (df["translator_main_id"] != "no_highlight_t4")]
        out_dict[config] = {}
        for lang in langs:
            print(f"Loading {config} task for eng->{lang}...")
            lang_df = df[(df["tgt_lang"] == lang) & df["wmt_category"].isin(domains)]
            # Both filters must hold at once. Since each defaults to "all values", combining them
            # with OR would turn a restriction on only one of them into a no-op.
            lang_df = lang_df[
                lang_df["translator_main_id"].str.endswith(speed_suffixes)
                & lang_df["highlight_modality"].isin(highlight_modalities)
            ]
            labl_dataset = EditedDataset.from_edits_dataframe(
                lang_df,
                text_column="mt_text",
                edit_column="pe_text",
                entry_ids=["doc_id", "segment_in_doc_id"],
                infos_columns=[
                    "wmt_category",
                    "doc_id",
                    "segment_in_doc_id",
                    "translator_main_id",
                    "highlight_modality",
                ],
                tokenizer=tokenizer,
                tokenizer_kwargs=tokenizer_kwargs,
                with_gaps=with_gaps,
                sub_label=sub_label,
                ins_label=ins_label,
                del_label=del_label,
                gap_token=gap_token,
            )
            out_dict[config][lang] = labl_dataset
    return out_dict

labl.datasets.translation.load_divemt

load_divemt(
    configs: DivemtTask | list[DivemtTask] = "main",
    langs: DivemtLanguage | list[DivemtLanguage] = [
        "ara",
        "nld",
        "ita",
        "tur",
        "ukr",
        "vie",
    ],
    mt_models: DivemtMTModel | list[DivemtMTModel] = [
        "gtrans",
        "mbart50",
    ],
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict[str, Any] = {},
    with_gaps: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> dict[str, dict[str, dict[str, EditedDataset]]]

Load the DivEMT dataset by Sarti et al. (2022), containing edits over two sets of machine-translated sentences across six typologically diverse languages.

Parameters:

Name Type Description Default
configs Literal["warmup", "main"] | list[Literal["warmup", "main"]], *optional*

One or more task configurations to load. Defaults to "main". Available options: "warmup", "main".

'main'
langs Literal["ara", "nld", "ita", "tur", "ukr", "vie"] | list[Literal["ara", "nld", "ita", "tur", "ukr", "vie"]], *optional*

One or more languages to load. Defaults to ["ara", "nld", "ita", "tur", "ukr", "vie"]. Available options: "ara", "nld", "ita", "tur", "ukr", "vie".

['ara', 'nld', 'ita', 'tur', 'ukr', 'vie']
mt_models Literal["gtrans", "mbart50"] | list[Literal["gtrans", "mbart50"]], *optional*

One or more models for which post-edits need to be loaded. Defaults to ["gtrans", "mbart50"]. Available options: "gtrans", "mbart50".

['gtrans', 'mbart50']
tokenizer str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast, *optional*

The tokenizer to use for tokenization. If None, a default whitespace tokenizer will be used.

None
tokenizer_kwargs dict[str, Any], *optional*

Additional arguments for the tokenizer.

{}
with_gaps bool, *optional*

Whether to include gaps in the tokenization. Defaults to True.

True
sub_label str, *optional*

The label for substitutions. Defaults to "S".

'S'
ins_label str, *optional*

The label for insertions. Defaults to "I".

'I'
del_label str, *optional*

The label for deletions. Defaults to "D".

'D'
gap_token str, *optional*

The token used for gaps. Defaults to "▁".

'▁'

Returns:

Type Description
dict[str, dict[str, dict[str, EditedDataset]]]

A dictionary containing the loaded datasets for each task, language, and MT model. The keys are the task configurations, and the values are dictionaries keyed by language whose values are, in turn, dictionaries mapping MT model names to EditedDataset objects. E.g. load_divemt()["main"]["ita"]["mbart50"] returns the EditedDataset for the main task, Italian, and the mbart50 model.

Source code in labl/datasets/translation/divemt.py
def load_divemt(
    configs: DivemtTask | list[DivemtTask] = "main",
    langs: DivemtLanguage | list[DivemtLanguage] = ["ara", "nld", "ita", "tur", "ukr", "vie"],
    mt_models: DivemtMTModel | list[DivemtMTModel] = ["gtrans", "mbart50"],
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict[str, Any] = {},
    with_gaps: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> dict[str, dict[str, dict[str, EditedDataset]]]:
    """Load the DivEMT dataset by [Sarti et al. (2022)](https://aclanthology.org/2022.emnlp-main.532/), containing edits
        over two sets of machine-translated sentences across six typologically diverse languages.

    Args:
        configs (Literal["warmup", "main"] | list[Literal["warmup", "main"]], *optional*):
            One or more task configurations to load. Defaults to "main".
            Available options: "warmup", "main".
        langs (Literal["ara", "nld", "ita", "tur", "ukr", "vie"] | list[Literal["ara", "nld", "ita", "tur", "ukr", "vie"]], *optional*):
            One or more languages to load. Defaults to ["ara", "nld", "ita", "tur", "ukr", "vie"].
            Available options: "ara", "nld", "ita", "tur", "ukr", "vie".
        mt_models (Literal["gtrans", "mbart50"] | list[Literal["gtrans", "mbart50"]], *optional*):
            One or more models for which post-edits need to be loaded. Defaults to ["gtrans", "mbart50"].
            Available options: "gtrans", "mbart50". Each model is mapped to a value of the `task_type`
            column through `MT_MODEL_MAP`.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast, *optional*):
            The tokenizer to use for tokenization. If None, a default whitespace tokenizer will be used.
        tokenizer_kwargs (dict[str, Any], *optional*):
            Additional arguments for the tokenizer.
        with_gaps (bool, *optional*):
            Whether to include gaps in the tokenization. Defaults to True.
        sub_label (str, *optional*):
            The label for substitutions. Defaults to "S".
        ins_label (str, *optional*):
            The label for insertions. Defaults to "I".
        del_label (str, *optional*):
            The label for deletions. Defaults to "D".
        gap_token (str, *optional*):
            The token used for gaps. Defaults to "▁".

    Returns:
        A dictionary containing the loaded datasets for each task, language, and MT model.
            The keys are the task configurations, and the values are dictionaries keyed by language whose
            values are, in turn, dictionaries mapping MT model names to `EditedDataset` objects.
            E.g. `load_divemt()["main"]["ita"]["mbart50"]` returns the `EditedDataset` for the main task,
            Italian, and the `mbart50` model.

    Raises:
        RuntimeError: If the `datasets` or `pandas` library is not installed.
    """
    # Report exactly which optional dependency is missing instead of always blaming `datasets`.
    missing = [
        name
        for name, available in (("datasets", is_datasets_available()), ("pandas", is_pandas_available()))
        if not available
    ]
    if missing:
        raise RuntimeError(
            f"The following required libraries are not installed: {', '.join(missing)}. "
            "Please install them to use this function."
        )
    import pandas as pd

    from datasets import DatasetDict, load_dataset

    # Accept a single value wherever a list of values is expected.
    if isinstance(configs, str):
        configs = [configs]
    if isinstance(langs, str):
        langs = [langs]
    if isinstance(mt_models, str):
        mt_models = [mt_models]

    out_dict: dict[str, dict[str, dict[str, EditedDataset]]] = {}
    for config in configs:
        # Each configuration is downloaded once and then sliced per language and MT model.
        dataset = cast(DatasetDict, load_dataset("GroNLP/divemt", config))
        df = cast(pd.DataFrame, dataset["train"].to_pandas())
        out_dict[config] = {}
        for lang in langs:
            out_dict[config][lang] = {}
            for model in mt_models:
                print(f"Loading {config} task for eng->{lang} {model} edits...")
                filter_df = df[(df["lang_id"] == lang) & (df["task_type"] == MT_MODEL_MAP[model])]
                labl_dataset = EditedDataset.from_edits_dataframe(
                    filter_df,
                    text_column="mt_text",
                    edit_column="tgt_text",
                    entry_ids="item_id",
                    infos_columns=["doc_id", "subject_id", "item_id"],
                    tokenizer=tokenizer,
                    tokenizer_kwargs=tokenizer_kwargs,
                    with_gaps=with_gaps,
                    sub_label=sub_label,
                    ins_label=ins_label,
                    del_label=del_label,
                    gap_token=gap_token,
                )
                out_dict[config][lang][model] = labl_dataset
    return out_dict

labl.datasets.translation.load_wmt24esa

load_wmt24esa(
    langs: Wmt24EsaLanguage
    | list[Wmt24EsaLanguage]
    | None = None,
    domains: Wmt24EsaDomain
    | list[Wmt24EsaDomain]
    | None = None,
    mt_models: Wmt24EsaMTModel
    | list[Wmt24EsaMTModel]
    | None = None,
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict[str, Any] = {},
) -> dict[str, dict[str, LabeledDataset]]

Load the WMT24 ESA annotations from Kocmi et al. (2024), containing partially overlapping segments across multiple language pairs with a single set of ESA annotations over multiple MT system outputs.

Parameters:

Name Type Description Default
langs Wmt24EsaLanguage | list[Wmt24EsaLanguage] | None

One or more languages to load. Defaults to ["en-cs", "en-ja", "en-es", "en-zh", "en-hi", "en-is", "cs-uk", "en-uk", "en-ru"]. Available options: "en-cs", "en-ja", "en-es", "en-zh", "en-hi", "en-is", "cs-uk", "en-uk", "en-ru".

None
domains Wmt24EsaDomain | list[Wmt24EsaDomain] | None

One or more text categories to load. Defaults to ["speech", "social", "news", "literary", "education", "voice", "personal", "official"]. Available options: "speech", "social", "news", "literary", "education", "voice", "personal", "official".

None
mt_models Wmt24EsaMTModel | list[Wmt24EsaMTModel] | None

One or more models for which annotations need to be loaded. Defaults to all models. Available options: "Unbabel-Tower70B", "CUNI-GA", "Gemini-1.5-Pro", "SCIR-MT", "Aya23", "Claude-3.5", "ONLINE-W", "Llama3-70B", "GPT-4", "CommandR-plus", "IKUN-C", "refA", "IOL-Research", "CUNI-DocTransformer", "IKUN", "CUNI-MH", "Mistral-Large", "ONLINE-B", "Dubformer", "MSLC", "Team-J", "HW-TSC", "NTTSU", "TranssionMT", "AMI", "CUNI-Transformer", "ONLINE-G", "Yandex".

None
tokenizer str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast, *optional*

The tokenizer to use for tokenization. If None, a default whitespace tokenizer will be used.

None
tokenizer_kwargs dict[str, Any], *optional*

Additional arguments for the tokenizer.

{}

Returns:

Type Description
dict[str, dict[str, LabeledDataset]]

A dictionary containing the loaded datasets for each MT model and language. The keys are the MT model names, and the values are dictionaries with language keys and LabeledDataset objects as values. E.g. load_wmt24esa()["Aya23"]["en-cs"] returns the LabeledDataset for the Aya23 model for English-Czech.

Source code in labl/datasets/translation/wmt24_esa.py
def load_wmt24esa(
    langs: Wmt24EsaLanguage | list[Wmt24EsaLanguage] | None = None,
    domains: Wmt24EsaDomain | list[Wmt24EsaDomain] | None = None,
    mt_models: Wmt24EsaMTModel | list[Wmt24EsaMTModel] | None = None,
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict[str, Any] = {},
) -> dict[str, dict[str, LabeledDataset]]:
    """Load the WMT24 ESA annotations from [Kocmi et al. (2024)](https://aclanthology.org/2024.wmt-1.1/), containing
        partially overlapping segments across multiple language pairs with a single set of
        [ESA annotations](https://aclanthology.org/2024.wmt-1.131/) over multiple MT system outputs.

    Args:
        langs (Wmt24EsaLanguage | list[Wmt24EsaLanguage] | None):
            One or more languages to load. Defaults to `["en-cs", "en-ja", "en-es", "en-zh", "en-hi", "en-is", "cs-uk",
                "en-uk", "en-ru"]`. Available options: `"en-cs", "en-ja", "en-es", "en-zh", "en-hi", "en-is", "cs-uk",
                "en-uk", "en-ru"`.
        domains (Wmt24EsaDomain | list[Wmt24EsaDomain] | None):
            One or more text categories to load. Defaults to `["speech", "social", "news", "literary", "education",
                "voice", "personal", "official"]`. Available options: `"speech", "social", "news", "literary",
                "education", "voice", "personal", "official"`.
        mt_models (Wmt24EsaMTModel | list[Wmt24EsaMTModel] | None):
            One or more models for which annotations need to be loaded. Defaults to all models.
            Available options: `"Unbabel-Tower70B", "CUNI-GA", "Gemini-1.5-Pro", "SCIR-MT", "Aya23", "Claude-3.5",
                "ONLINE-W", "Llama3-70B", "GPT-4", "CommandR-plus", "IKUN-C", "refA", "IOL-Research",
                "CUNI-DocTransformer", "IKUN", "CUNI-MH", "Mistral-Large", "ONLINE-B", "Dubformer", "MSLC",
                "Team-J", "HW-TSC", "NTTSU", "TranssionMT", "AMI", "CUNI-Transformer", "ONLINE-G", "Yandex"`.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast, *optional*):
            The tokenizer to use for tokenization. If None, a default whitespace tokenizer will be used.
        tokenizer_kwargs (dict[str, Any], *optional*):
            Additional arguments for the tokenizer.

    Returns:
        A dictionary containing the loaded datasets for each MT model and language.
            The keys are the MT model names, and the values are dictionaries with language keys
            and `LabeledDataset` objects as values. E.g. `load_wmt24esa()["Aya23"]["en-cs"]` returns
            the `LabeledDataset` for the Aya23 model for English-Czech. A language key is only present
            for a model when at least one row matches the requested model, language and domains.

    Raises:
        RuntimeError: If the `pandas` library is not installed.
    """
    if not is_pandas_available():
        raise RuntimeError("The `pandas` library is not installed. Please install it to use this function.")

    # Resolve each selector to a list: `None` selects everything, a bare string selects one value.
    if langs is None:
        selected_langs = ["en-cs", "en-ja", "en-es", "en-zh", "en-hi", "en-is", "cs-uk", "en-uk", "en-ru"]
    elif isinstance(langs, str):
        selected_langs = [langs]
    else:
        selected_langs = langs

    if domains is None:
        selected_domains = ["speech", "social", "news", "literary", "education", "voice", "personal", "official"]
    elif isinstance(domains, str):
        selected_domains = [domains]
    else:
        selected_domains = domains

    if mt_models is None:
        selected_models = [
            "Unbabel-Tower70B",
            "CUNI-GA",
            "Gemini-1.5-Pro",
            "SCIR-MT",
            "Aya23",
            "Claude-3.5",
            "ONLINE-W",
            "Llama3-70B",
            "GPT-4",
            "CommandR-plus",
            "IKUN-C",
            "refA",
            "IOL-Research",
            "CUNI-DocTransformer",
            "IKUN",
            "CUNI-MH",
            "Mistral-Large",
            "ONLINE-B",
            "Dubformer",
            "MSLC",
            "Team-J",
            "HW-TSC",
            "NTTSU",
            "TranssionMT",
            "AMI",
            "CUNI-Transformer",
            "ONLINE-G",
            "Yandex",
        ]
    elif isinstance(mt_models, str):
        selected_models = [mt_models]
    else:
        selected_models = mt_models

    # Row fields copied verbatim into each entry's info dictionary.
    info_columns = ("line_id", "doc_id", "domain", "esa_score", "annotator")

    annotations = load_cached_or_download(
        url="https://raw.githubusercontent.com/wmt-conference/wmt24-news-systems/refs/heads/main/jsonl/wmt24_esa.jsonl",
        filetype="jsonl",
    )

    result = {}
    for model in selected_models:
        per_lang = {}
        result[model] = per_lang
        for lang in selected_langs:
            subset = annotations[
                annotations["domain"].isin(selected_domains)
                & (annotations["system"] == model)
                & (annotations["langs"] == lang)
            ]
            # Model/language pairs without any matching row get no entry at all.
            if subset.empty:
                continue
            print(f"Loading {model} annotations for {lang}...")
            spans_per_text = []
            infos_per_text = []
            for _, row in subset.iterrows():
                row_infos = {column: row[column] for column in info_columns}
                row_spans = []
                for raw_span in row["esa_spans"]:
                    begin, finish = raw_span["start_i"], raw_span["end_i"]
                    # Keep only spans with integer offsets covering at least one character.
                    if not (isinstance(begin, int) and isinstance(finish, int) and begin < finish):
                        continue
                    row_spans.append(
                        Span(
                            start=begin,
                            end=finish,
                            label=raw_span["severity"],
                            text=row["tgt"][begin:finish],
                        )
                    )
                spans_per_text.append(row_spans)
                infos_per_text.append(row_infos)
            per_lang[lang] = LabeledDataset.from_spans(
                texts=list(subset["tgt"]),
                spans=spans_per_text,
                infos=infos_per_text,
                tokenizer=tokenizer,
                tokenizer_kwargs=tokenizer_kwargs,
            )
    return result