Skip to content

Dataset

labl.data.base_sequence.BaseLabeledDataset

BaseLabeledDataset(
    iterable: Iterable[LabeledObject] | None = None,
    *,
    info: InfoDictType = {},
)

Bases: BaseLabeledSequence[EntryType | BaseMultiLabelEntry[EntryType]], ABC

Base class for all dataset classes containing BaseLabeledEntry objects.

Source code in labl/data/base_sequence.py
def __init__(self, iterable: Iterable[LabeledObject] | None = None, *, info: InfoDictType = {}):
    """Initialize the dataset from an optional iterable of labeled objects.

    Args:
        iterable (Iterable[LabeledObject] | None): Objects forwarded to the parent sequence initializer. If None,
            the parent initializer is called without arguments.
        info (InfoDictType): Additional information attached to the dataset. The mapping is shallow-copied, so
            the dataset never aliases the shared `{}` default or a dictionary owned by the caller.
    """
    if iterable is None:
        super().__init__()
    else:
        super().__init__(iterable)
    self._label_types = self._get_label_types()
    # Storing `info` directly would make every dataset created without `info` share the same default dict
    # object, so editing one dataset's info would silently edit all of them.
    self._info = dict(info)

get_agreement

get_agreement(
    other: BaseLabeledSequence[
        EntryType | BaseMultiLabelEntry[EntryType]
    ]
    | None = None,
    level_of_measurement: LevelOfMeasurement | None = None,
) -> AgreementOutput

Compute the inter-annotator agreement for the token labels of all label sets using Krippendorff's alpha.

Source code in labl/data/base_sequence.py
def get_agreement(
    self,
    other: BaseLabeledSequence[EntryType | BaseMultiLabelEntry[EntryType]] | None = None,
    level_of_measurement: LevelOfMeasurement | None = None,
) -> AgreementOutput:
    """Compute the inter-annotator agreement for the token labels of all label sets using
    [Krippendorff's alpha](https://en.wikipedia.org/wiki/Krippendorff%27s_alpha).

    Args:
        other (BaseLabeledSequence | None): An optional second sequence whose labels are included in the
            computation. It must hold a single label type, identical to the one of this sequence.
        level_of_measurement (LevelOfMeasurement | None): Forwarded unchanged to `get_labels_agreement`.

    Returns:
        The `AgreementOutput` produced by `get_labels_agreement`.

    Raises:
        RuntimeError: If `other` is provided and its label type differs from the one of this sequence.
    """
    # Both sequences are validated before any label type is read.
    self._validate_single_label_type()
    if other is not None:
        other._validate_single_label_type()
    label_type = self.label_types[0]
    if other is not None and label_type != other.label_types[0]:
        raise RuntimeError(
            f"Label type does not match: {label_type} vs {other.label_types[0]}.\n"
            "Transform the annotations using `.relabel` to ensure a single type is present."
        )
    return get_labels_agreement(
        label_type=label_type,
        labels_array=self._get_labels_array(other=other, dtype=label_type),
        level_of_measurement=level_of_measurement,
    )

labl.data.labeled_dataset.LabeledDataset

LabeledDataset(
    iterable: Iterable[LabeledObject] | None = None,
    *,
    info: InfoDictType = {},
)

Bases: BaseLabeledDataset[LabeledEntry]

Dataset class for handling collections of LabeledEntry objects.

Attributes:

Name Type Description
data list[LabeledEntry]

A list of LabeledEntry objects.

Source code in labl/data/base_sequence.py
def __init__(self, iterable: Iterable[LabeledObject] | None = None, *, info: InfoDictType = {}):
    """Initialize the dataset from an optional iterable of labeled objects.

    Args:
        iterable (Iterable[LabeledObject] | None): Objects forwarded to the parent sequence initializer. If None,
            the parent initializer is called without arguments.
        info (InfoDictType): Additional information attached to the dataset. The mapping is shallow-copied, so
            the dataset never aliases the shared `{}` default or a dictionary owned by the caller.
    """
    if iterable is None:
        super().__init__()
    else:
        super().__init__(iterable)
    self._label_types = self._get_label_types()
    # Storing `info` directly would make every dataset created without `info` share the same default dict
    # object, so editing one dataset's info would silently edit all of them.
    self._info = dict(info)

from_spans classmethod

from_spans(
    texts: list[str],
    spans: list[list[Span]] | list[list[SpanType]],
    infos: list[InfoDictType] | None = None,
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict = {},
) -> LabeledDataset

Create a LabeledDataset from a set of texts and one or more spans for each text.

Parameters:

Name Type Description Default

texts

list[str]

The set of text.

required

spans

list[list[Span]] | list[list[dict[str, str | int | float | None]]]

A list of spans for each text.

required

infos

list[dict[str, str | int | float | bool]] | None

A list of dictionaries containing additional information for each entry. If None, no additional information is added. Defaults to None.

None

tokenizer

str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None

A Tokenizer used for tokenization. Supports initialization from a transformers.PreTrainedTokenizer, and uses whitespace tokenization by default.

None

tokenizer_kwargs

dict

Additional arguments for the tokenizer.

{}
Source code in labl/data/labeled_dataset.py
@classmethod
def from_spans(
    cls,
    texts: list[str],
    spans: list[list[Span]] | list[list[SpanType]],
    infos: list[InfoDictType] | None = None,
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict = {},
) -> "LabeledDataset":
    """Create a `LabeledDataset` from a set of texts and one or more spans for each text.

    Args:
        texts (list[str]):
            The set of texts.
        spans (list[list[Span]] | list[list[dict[str, str | int | float | None]]]):
            A list of spans for each text.
        infos (list[dict[str, str | int | float | bool]] | None):
            A list of dictionaries containing additional information for each entry.
            If None, every entry receives its own empty dictionary. Defaults to None.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None): A `Tokenizer`
            used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses
            whitespace tokenization by default.
        tokenizer_kwargs (dict): Additional arguments for the tokenizer.

    Returns:
        A dataset with one `LabeledEntry` per text.

    Raises:
        ValueError: If `texts`, `spans` and `infos` do not have the same length.
    """
    tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    if infos is None:
        # `[{}] * n` would hand the *same* dict object to every entry; build one fresh dict per entry instead.
        infos = [{} for _ in texts]
    return cls(
        [
            LabeledEntry.from_spans(
                text,
                span,
                tokenizer=tokenizer,
                info=info,
            )
            for text, span, info in tqdm(
                zip(texts, spans, infos, strict=True),
                desc="Creating labeled dataset",
                total=len(texts),
                unit="entries",
            )
        ]
    )

from_tagged classmethod

from_tagged(
    tagged: list[str],
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    keep_tags: list[str] = [],
    ignore_tags: list[str] = [],
    tokenizer_kwargs: dict = {},
    infos: list[InfoDictType] | None = None,
) -> LabeledDataset

Create a LabeledDataset from a set of tagged texts.

Parameters:

Name Type Description Default

tagged

list[str]

The set of tagged text.

required

tokenizer

str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None

A Tokenizer used for tokenization. Supports initialization from a transformers.PreTrainedTokenizer, and uses whitespace tokenization by default.

None

infos

list[dict[str, str | int | float | bool]] | None

A list of dictionaries containing additional information for each entry. If None, no additional information is added. Defaults to None.

None

keep_tags

list[str]

A list of tags to keep.

[]

ignore_tags

list[str]

A list of tags to ignore.

[]

tokenizer_kwargs

dict

Additional arguments for the tokenizer.

{}
Source code in labl/data/labeled_dataset.py
@classmethod
def from_tagged(
    cls,
    tagged: list[str],
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    keep_tags: list[str] = [],
    ignore_tags: list[str] = [],
    tokenizer_kwargs: dict = {},
    infos: list[InfoDictType] | None = None,
) -> "LabeledDataset":
    """Create a `LabeledDataset` from a set of tagged texts.

    Args:
        tagged (list[str]):
            The set of tagged texts.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None): A `Tokenizer`
            used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses
            whitespace tokenization by default.
        keep_tags (list[str]): A list of tags to keep.
        ignore_tags (list[str]): A list of tags to ignore.
        tokenizer_kwargs (dict): Additional arguments for the tokenizer.
        infos (list[dict[str, str | int | float | bool]] | None):
            A list of dictionaries containing additional information for each entry.
            If None, every entry receives its own empty dictionary. Defaults to None.

    Returns:
        A dataset with one `LabeledEntry` per tagged text.

    Raises:
        ValueError: If `tagged` and `infos` do not have the same length.
    """
    tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    if infos is None:
        # `[{}] * n` would hand the *same* dict object to every entry; build one fresh dict per entry instead.
        infos = [{} for _ in tagged]
    return cls(
        [
            LabeledEntry.from_tagged(
                text,
                tokenizer=tokenizer,
                keep_tags=keep_tags,
                ignore_tags=ignore_tags,
                info=info,
            )
            for text, info in tqdm(
                zip(tagged, infos, strict=True), desc="Creating labeled dataset", total=len(tagged), unit="entries"
            )
        ]
    )

from_tokens classmethod

from_tokens(
    tokens: list[list[str]],
    labels: Sequence[Sequence[LabelType]],
    infos: list[InfoDictType] | None = None,
    keep_labels: list[str] = [],
    ignore_labels: list[str] = [],
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict = {},
) -> LabeledDataset

Create a LabeledDataset from a set of tokenized texts.

Parameters:

Name Type Description Default

tokens

list[list[str]] | None

A list of lists of string tokens.

required

labels

list[list[str | int | float | None]] | None

A list of lists of labels for the tokens.

required

infos

list[dict[str, str | int | float | bool]] | None

A list of dictionaries containing additional information for each entry. If None, no additional information is added. Defaults to None.

None

keep_labels

list[str]

A list of labels to keep.

[]

ignore_labels

list[str]

A list of labels to ignore.

[]

tokenizer

str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None

A Tokenizer used for tokenization. Supports initialization from a transformers.PreTrainedTokenizer, and uses whitespace tokenization by default.

None

tokenizer_kwargs

dict

Additional arguments for the tokenizer.

{}
Source code in labl/data/labeled_dataset.py
@classmethod
def from_tokens(
    cls,
    tokens: list[list[str]],
    labels: Sequence[Sequence[LabelType]],
    infos: list[InfoDictType] | None = None,
    keep_labels: list[str] = [],
    ignore_labels: list[str] = [],
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict = {},
) -> "LabeledDataset":
    """Create a `LabeledDataset` from a set of tokenized texts.

    Args:
        tokens (list[list[str]]):
            A list of lists of string tokens.
        labels (list[list[str | int | float | None]]):
            A list of lists of labels for the tokens.
        infos (list[dict[str, str | int | float | bool]] | None):
            A list of dictionaries containing additional information for each entry.
            If None, every entry receives its own empty dictionary. Defaults to None.
        keep_labels (list[str]): A list of labels to keep.
        ignore_labels (list[str]): A list of labels to ignore.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None): A `Tokenizer`
            used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses
            whitespace tokenization by default.
        tokenizer_kwargs (dict): Additional arguments for the tokenizer.

    Returns:
        A dataset with one `LabeledEntry` per token list.

    Raises:
        ValueError: If `tokens`, `labels` and `infos` do not have the same length.
    """
    tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    if infos is None:
        # `[{}] * n` would hand the *same* dict object to every entry; build one fresh dict per entry instead.
        infos = [{} for _ in tokens]
    return cls(
        [
            LabeledEntry.from_tokens(
                tokens=entry_tokens,
                labels=entry_labels,
                keep_labels=keep_labels,
                ignore_labels=ignore_labels,
                tokenizer=tokenizer,
                info=entry_info,
            )
            # strict=True mirrors the sibling constructors: mismatched lengths are reported instead of
            # silently dropping the surplus labels/infos.
            for entry_tokens, entry_labels, entry_info in tqdm(
                zip(tokens, labels, infos, strict=True),
                desc="Creating LabeledDataset",
                total=len(tokens),
                unit="entries",
            )
        ]
    )

labl.data.edited_dataset.EditedDataset

EditedDataset(
    iterable: Iterable[LabeledObject] | None = None,
    *,
    info: InfoDictType = {},
)

Bases: BaseLabeledDataset[EditedEntry]

Dataset class for handling collections of EditedEntry and MultiEditEntry objects.

Attributes:

Name Type Description
data list[EditedEntry] | list[MultiEditEntry]

A list of EditedEntry or MultiEditEntry objects.

Source code in labl/data/base_sequence.py
def __init__(self, iterable: Iterable[LabeledObject] | None = None, *, info: InfoDictType = {}):
    """Initialize the dataset from an optional iterable of labeled objects.

    Args:
        iterable (Iterable[LabeledObject] | None): Objects forwarded to the parent sequence initializer. If None,
            the parent initializer is called without arguments.
        info (InfoDictType): Additional information attached to the dataset. The mapping is shallow-copied, so
            the dataset never aliases the shared `{}` default or a dictionary owned by the caller.
    """
    if iterable is None:
        super().__init__()
    else:
        super().__init__(iterable)
    self._label_types = self._get_label_types()
    # Storing `info` directly would make every dataset created without `info` share the same default dict
    # object, so editing one dataset's info would silently edit all of them.
    self._info = dict(info)

from_edits classmethod

from_edits(
    texts: list[str],
    edits: list[str] | list[list[str]],
    infos: list[InfoDictType]
    | list[list[InfoDictType]]
    | None = None,
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict = {},
    with_gaps: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> EditedDataset

Create an EditedDataset from a set of texts and one or more edits for each text.

Parameters:

Name Type Description Default

texts

list[str]

The set of text.

required

edits

list[str] | list[list[str]] | None

One or more edited versions for each text.

required

infos

list[dict[str, str | int | float | bool]] | list[list[dict[str, str | int | float | bool]]] | None

A list of dictionaries containing additional information for each entry. If multiple edits are provided for each text, infos can be a list of lists of dictionaries (one per edit per entry). If None, no additional information is added. Defaults to None.

None

tokenizer

str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None

A Tokenizer used for tokenization. Supports initialization from a transformers.PreTrainedTokenizer, and uses whitespace tokenization by default.

None

tokenizer_kwargs

dict

Additional arguments for the tokenizer.

{}

with_gaps

bool

Whether to add gaps to the tokens and offsets. Gaps are used to mark the positions of insertions and deletions in the original/edited texts, respectively. If false, those are merged to the next token to the right. Default: True.

True

sub_label

str

The label for substitutions. Default: "S".

'S'

ins_label

str

The label for insertions. Default: "I".

'I'

del_label

str

The label for deletions. Default: "D".

'D'

gap_token

str

The token to use for gaps. Default: "▁".

'▁'
Source code in labl/data/edited_dataset.py
@classmethod
def from_edits(
    cls,
    texts: list[str],
    edits: list[str] | list[list[str]],
    infos: list[InfoDictType] | list[list[InfoDictType]] | None = None,
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict = {},
    with_gaps: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> "EditedDataset":
    """Create an `EditedDataset` from a set of texts and one or more edits for each text.

    Args:
        texts (list[str]):
            The set of texts.
        edits (list[str] | list[list[str]]):
            One or more edited versions for each text.
        infos (list[dict[str, str | int | float | bool]] | list[list[dict[str, str | int | float | bool]]] | None):
            A list of dictionaries containing additional information for each entry.
            If multiple edits are provided for each text, `infos` can be a list of lists of dictionaries (one per
            edit per entry). If None, every entry receives its own empty dictionary. Defaults to None.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None): A `Tokenizer`
            used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses
            whitespace tokenization by default.
        tokenizer_kwargs (dict): Additional arguments for the tokenizer.
        with_gaps (bool): Whether to add gaps to the tokens and offsets. Gaps are used to mark the positions of
            insertions and deletions in the original/edited texts, respectively. If false, those are merged to the
            next token to the right. Default: True.
        sub_label (str): The label for substitutions. Default: "S".
        ins_label (str): The label for insertions. Default: "I".
        del_label (str): The label for deletions. Default: "D".
        gap_token (str): The token to use for gaps. Default: "▁".

    Returns:
        A dataset with one entry (as built by `EditedEntry.from_edits`) per text.

    Raises:
        ValueError: If `texts`, `edits` and `infos` do not have the same length.
    """
    tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    if infos is None:
        # `[{}] * n` would hand the *same* dict object to every entry; build one fresh dict per entry instead.
        infos = [{} for _ in texts]
    return cls(
        [
            EditedEntry.from_edits(
                text=text,
                edits=edit,
                tokenizer=tokenizer,
                with_gaps=with_gaps,
                sub_label=sub_label,
                ins_label=ins_label,
                del_label=del_label,
                gap_token=gap_token,
                info=info,
            )
            for text, edit, info in tqdm(
                zip(texts, edits, infos, strict=True),
                desc="Creating EditedDataset",
                total=len(texts),
                unit="entries",
            )
        ]
    )

from_edits_dataframe classmethod

from_edits_dataframe(
    df,
    text_column: str,
    edit_column: str,
    entry_ids: str | list[str],
    infos_columns: list[str] = [],
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict[str, Any] = {},
    with_gaps: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> EditedDataset

Create an EditedDataset from a pandas.DataFrame with edits.

Every row in the DataFrame is an entry identified univocally by entry_ids. The text_column contains the original text, and the edit_column contains the edits. If multiple rows with the same entry_ids are present, they are all treated as edits of the same text.

Parameters:

Name Type Description Default

df

DataFrame

The DataFrame containing the text and edits.

required

text_column

str

The name of the column in the dataframe containing the original text.

required

edit_column

str

The name of the column in the dataframe containing the edited text.

required

entry_ids

str | list[str]

One or more column names acting as unique identifiers for each entry. If multiple entries are found with the same entry_ids, they are all treated as edits of the same text.

required

infos_columns

list[str]

A list of columns containing additional information for each entry.

[]

tokenizer

str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None

A Tokenizer used for tokenization. Supports initialization from a transformers.PreTrainedTokenizer, and uses whitespace tokenization by default.

None

tokenizer_kwargs

dict[str, Any]

Additional arguments for the tokenizer. Defaults to {}.

{}

with_gaps

bool

Whether to add gaps to the tokens and offsets. Gaps are used to mark the positions of insertions and deletions in the original/edited texts, respectively. If false, those are merged to the next token to the right. Default: True.

True

sub_label

str

The label for substitutions. Default: "S".

'S'

ins_label

str

The label for insertions. Default: "I".

'I'

del_label

str

The label for deletions. Default: "D".

'D'

gap_token

str

The token to use for gaps. Default: "▁".

'▁'

Returns:

Type Description
EditedDataset

An EditedDataset initialized from the set of texts and edits.

Source code in labl/data/edited_dataset.py
@classmethod
def from_edits_dataframe(
    cls,
    df,
    text_column: str,
    edit_column: str,
    entry_ids: str | list[str],
    infos_columns: list[str] = [],
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict[str, Any] = {},
    with_gaps: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> "EditedDataset":
    """Create an `EditedDataset` from a `pandas.DataFrame` with edits.

    Every row in the DataFrame is an entry identified univocally by `entry_ids`. The `text_column` contains the
    original text, and the `edit_column` contains the edits. If multiple rows with the same `entry_ids` are
    present, they are all treated as edits of the same text.

    Args:
        df (pandas.DataFrame): The DataFrame containing the text and edits.
        text_column (str): The name of the column in the dataframe containing the original text.
        edit_column (str): The name of the column in the dataframe containing the edited text.
        entry_ids (str | list[str]): One or more column names acting as unique identifiers for each entry. If
            multiple rows are found with the same `entry_ids`, they are all treated as edits of the same text.
        infos_columns (list[str]): A list of columns containing additional information for each entry.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None, optional): A `Tokenizer`
            used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses
            whitespace tokenization by default.
        tokenizer_kwargs (dict[str, Any], optional): Additional arguments for the tokenizer. Defaults to {}.
        with_gaps (bool): Whether to add gaps to the tokens and offsets. Gaps are used to mark the positions of
            insertions and deletions in the original/edited texts, respectively. If false, those are merged to the
            next token to the right. Default: True.
        sub_label (str): The label for substitutions. Default: "S".
        ins_label (str): The label for insertions. Default: "I".
        del_label (str): The label for deletions. Default: "D".
        gap_token (str): The token to use for gaps. Default: "▁".

    Returns:
        An `EditedDataset` (or the subclass this method is called on) initialized from the set of texts and edits.

    Raises:
        ImportError: If pandas is not installed.
    """
    if not is_pandas_available():
        raise ImportError("Pandas is not installed. Please install pandas to use this function.")
    import pandas as pd

    tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    df = cast(pd.DataFrame, df)
    if isinstance(entry_ids, str):
        entry_ids = [entry_ids]
    # Walk the groups directly: one pass over the frame instead of re-filtering the whole frame for every
    # group, and no key comparison against values that `iterrows` may have upcast to another dtype.
    grouped = df.groupby(entry_ids)
    all_texts = []
    all_edits = []
    all_infos = []
    for _, edit_rows in tqdm(grouped, desc="Extracting texts and edits", total=len(grouped), unit="entries"):
        # All rows of a group share the same original text, so the first one is used.
        all_texts.append(edit_rows[text_column].tolist()[0])
        all_edits.append(edit_rows[edit_column].tolist())
        # One info dictionary per edit (i.e. per row of the group).
        all_infos.append([{col: edit_row[col] for col in infos_columns} for _, edit_row in edit_rows.iterrows()])
    # `cls` rather than a hard-coded `EditedDataset`, so subclasses get an instance of their own type.
    return cls.from_edits(
        all_texts,
        all_edits,
        all_infos,
        tokenizer=tokenizer,
        with_gaps=with_gaps,
        sub_label=sub_label,
        ins_label=ins_label,
        del_label=del_label,
        gap_token=gap_token,
    )

merge_gap_annotations

merge_gap_annotations(
    merge_fn: Callable[[Sequence[LabelType]], LabelType]
    | None = None,
    keep_final_gap: bool = True,
) -> None

Merge gap annotations in the tokens of orig and edit.

This method is equivalent to calling EditedEntry.from_edits with with_gaps=False. Gap annotations are merged to the next non-gap token to the right, and the gap label is added to the label of the non-gap token. The last gap is kept to account for insertions at the end of the text.

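E.g. GAP Hello GAP World GAP ! GAP, labeled I (first gap), S (Hello), I (second gap) and I (final gap), becomes Hello World ! GAP, labeled IS (Hello), I (World) and I (final gap).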

Source code in labl/data/edited_dataset.py
def merge_gap_annotations(
    self,
    merge_fn: Callable[[Sequence[LabelType]], LabelType] | None = None,
    keep_final_gap: bool = True,
) -> None:
    """Merge gap annotations in the tokens of `orig` and `edit` for every entry of the dataset.

    This method is equivalent to calling `EditedEntry.from_edits` with `with_gaps=False`. Gap annotations are merged
    to the next non-gap token to the right, and the gap label is added to the label of the non-gap token. The last
    gap is kept to account for insertions at the end of the text.

    E.g. `GAP Hello GAP World GAP ! GAP` labeled `I` (first gap), `S` (`Hello`), `I` (second gap) and `I` (final
    gap) becomes `Hello World ! GAP` labeled `IS` (`Hello`), `I` (`World`) and `I` (final gap).

    Args:
        merge_fn (Callable[[Sequence[LabelType]], LabelType] | None): Forwarded unchanged to the
            `merge_gap_annotations` method of each entry.
        keep_final_gap (bool): Forwarded unchanged to the `merge_gap_annotations` method of each entry
            (presumably controls whether the trailing gap is preserved -- confirm against the entry classes).
    """
    for item in self:
        # The cast only informs the type checker; each entry performs the actual merge itself.
        edited = cast(EditedEntry | MultiEditEntry, item)
        edited.merge_gap_annotations(merge_fn=merge_fn, keep_final_gap=keep_final_gap)