Dataset¶

labl.data.base_sequence.BaseLabeledDataset ¶

BaseLabeledDataset(
    iterable: Iterable[LabeledObject] | None = None,
    *,
    info: InfoDictType = {},
)

Bases: BaseLabeledSequence[EntryType | BaseMultiLabelEntry[EntryType]], ABC

Base class for all dataset classes containing BaseLabeledEntry objects.

Source code in labl/data/base_sequence.py

def __init__(self, iterable: Iterable[LabeledObject] | None = None, *, info: InfoDictType = {}):
    """Initialize the dataset from an optional iterable of labeled objects.

    Args:
        iterable (Iterable[LabeledObject] | None): Objects used to populate the dataset. If None, the parent
            sequence is initialized without arguments.
        info (InfoDictType): Additional information attached to the dataset. The mapping is shallow-copied on
            assignment, so the dataset never shares its info dictionary with the caller or with other datasets.
    """
    if iterable is None:
        super().__init__()
    else:
        super().__init__(iterable)
    self._label_types = self._get_label_types()
    # Copy instead of storing the reference: the `{}` default is created once at definition time, so storing it
    # directly would make every dataset built without `info` share (and mutate) the same dictionary.
    self._info = dict(info)

get_agreement ¶

get_agreement(
    other: BaseLabeledSequence[
        EntryType | BaseMultiLabelEntry[EntryType]
    ]
    | None = None,
    level_of_measurement: LevelOfMeasurement | None = None,
) -> AgreementOutput

Compute the inter-annotator agreement for the token labels of all label sets using Krippendorff's alpha.

Source code in labl/data/base_sequence.py

def get_agreement(
    self,
    other: BaseLabeledSequence[EntryType | BaseMultiLabelEntry[EntryType]] | None = None,
    level_of_measurement: LevelOfMeasurement | None = None,
) -> AgreementOutput:
    """Compute the inter-annotator agreement for the token labels of all label sets using
    [Krippendorff's alpha](https://en.wikipedia.org/wiki/Krippendorff%27s_alpha).

    Args:
        other (BaseLabeledSequence | None): Optional second sequence whose labels are included in the
            computation. When provided, it must contain a single label type equal to the one of this sequence.
        level_of_measurement (LevelOfMeasurement | None): Forwarded unchanged to `get_labels_agreement`.

    Returns:
        The `AgreementOutput` produced by `get_labels_agreement`.

    Raises:
        RuntimeError: If `other` is provided and its label type differs from the label type of this sequence.
    """
    # Both sequences must hold exactly one label type before their first label type can be compared.
    self._validate_single_label_type()
    if other is not None:
        other._validate_single_label_type()
    own_type = self.label_types[0]
    if other is not None:
        other_type = other.label_types[0]
        if own_type != other_type:
            raise RuntimeError(
                f"Label type does not match: {own_type} vs {other_type}.\n"
                "Transform the annotations using `.relabel` to ensure a single type is present."
            )
    merged_labels = self._get_labels_array(other=other, dtype=own_type)
    return get_labels_agreement(
        label_type=own_type,
        labels_array=merged_labels,
        level_of_measurement=level_of_measurement,
    )

labl.data.labeled_dataset.LabeledDataset ¶

LabeledDataset(
    iterable: Iterable[LabeledObject] | None = None,
    *,
    info: InfoDictType = {},
)

Bases: BaseLabeledDataset[LabeledEntry]

Dataset class for handling collections of LabeledEntry objects.

Attributes:

Name	Type	Description
`data`	`list[LabeledEntry]`	A list of LabeledEntry objects.

Source code in labl/data/base_sequence.py

def __init__(self, iterable: Iterable[LabeledObject] | None = None, *, info: InfoDictType = {}):
    """Initialize the dataset from an optional iterable of labeled objects.

    Args:
        iterable (Iterable[LabeledObject] | None): Objects used to populate the dataset. If None, the parent
            sequence is initialized without arguments.
        info (InfoDictType): Additional information attached to the dataset. The mapping is shallow-copied on
            assignment, so the dataset never shares its info dictionary with the caller or with other datasets.
    """
    if iterable is None:
        super().__init__()
    else:
        super().__init__(iterable)
    self._label_types = self._get_label_types()
    # Copy instead of storing the reference: the `{}` default is created once at definition time, so storing it
    # directly would make every dataset built without `info` share (and mutate) the same dictionary.
    self._info = dict(info)

from_spans `classmethod` ¶

from_spans(
    texts: list[str],
    spans: list[list[Span]] | list[list[SpanType]],
    infos: list[InfoDictType] | None = None,
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict = {},
) -> LabeledDataset

Create a LabeledDataset from a set of texts and one or more spans for each text.

Parameters:

Name	Type	Description	Default
`texts` ¶	`list[str]`	The set of texts.	required
`spans` ¶	`list[list[Span]] \| list[list[dict[str, str \| int \| float \| None]]]`	A list of spans for each text.	required
`infos` ¶	`list[dict[str, str \| int \| float \| bool]] \| None`	A list of dictionaries containing additional information for each entry. If None, no additional information is added. Defaults to None.	`None`
`tokenizer` ¶	`str \| Tokenizer \| PreTrainedTokenizer \| PreTrainedTokenizerFast \| None`	A `Tokenizer` used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses whitespace tokenization by default.	`None`
`tokenizer_kwargs` ¶	`dict`	Additional arguments for the tokenizer.	`{}`

Source code in labl/data/labeled_dataset.py

@classmethod
def from_spans(
    cls,
    texts: list[str],
    spans: list[list[Span]] | list[list[SpanType]],
    infos: list[InfoDictType] | None = None,
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict = {},
) -> "LabeledDataset":
    """Create a `LabeledDataset` from a set of texts and one or more spans for each text.

    Args:
        texts (list[str]):
            The set of texts.
        spans (list[list[Span]] | list[list[dict[str, str | int | float | None]]]):
            A list of spans for each text.
        infos (list[dict[str, str | int | float | bool]] | None):
            A list of dictionaries containing additional information for each entry.
            If None, every entry receives its own empty dictionary. Defaults to None.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None): A `Tokenizer`
            used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses
            whitespace tokenization by default.
        tokenizer_kwargs (dict): Additional arguments for the tokenizer.

    Returns:
        A `LabeledDataset` (or subclass, via `cls`) with one `LabeledEntry` per text.

    Raises:
        ValueError: If `texts`, `spans` and `infos` do not have the same length.
    """
    tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    if infos is None:
        # One fresh dict per entry: `[{}] * n` would repeat a single dict object, so a change to the info of
        # one entry would show up in all of them.
        infos = [{} for _ in texts]
    return cls(
        [
            LabeledEntry.from_spans(
                text,
                span,
                tokenizer=tokenizer,
                info=info,
            )
            for text, span, info in tqdm(
                zip(texts, spans, infos, strict=True),
                desc="Creating labeled dataset",
                total=len(texts),
                unit="entries",
            )
        ]
    )

from_tagged `classmethod` ¶

from_tagged(
    tagged: list[str],
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    keep_tags: list[str] = [],
    ignore_tags: list[str] = [],
    tokenizer_kwargs: dict = {},
    infos: list[InfoDictType] | None = None,
) -> LabeledDataset

Create a LabeledDataset from a set of tagged texts.

Parameters:

Name	Type	Description	Default
`tagged` ¶	`list[str]`	The set of tagged texts.	required
`tokenizer` ¶	`str \| Tokenizer \| PreTrainedTokenizer \| PreTrainedTokenizerFast \| None`	A `Tokenizer` used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses whitespace tokenization by default.	`None`
`infos` ¶	`list[dict[str, str \| int \| float \| bool]] \| None`	A list of dictionaries containing additional information for each entry. If None, no additional information is added. Defaults to None.	`None`
`keep_tags` ¶	`list[str]`	A list of tags to keep.	`[]`
`ignore_tags` ¶	`list[str]`	A list of tags to ignore.	`[]`
`tokenizer_kwargs` ¶	`dict`	Additional arguments for the tokenizer.	`{}`

Source code in labl/data/labeled_dataset.py

@classmethod
def from_tagged(
    cls,
    tagged: list[str],
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    keep_tags: list[str] = [],
    ignore_tags: list[str] = [],
    tokenizer_kwargs: dict = {},
    infos: list[InfoDictType] | None = None,
) -> "LabeledDataset":
    """Create a `LabeledDataset` from a set of tagged texts.

    Args:
        tagged (list[str]):
            The set of tagged texts.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None): A `Tokenizer`
            used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses
            whitespace tokenization by default.
        keep_tags (list[str]): A list of tags to keep.
        ignore_tags (list[str]): A list of tags to ignore.
        tokenizer_kwargs (dict): Additional arguments for the tokenizer.
        infos (list[dict[str, str | int | float | bool]] | None):
            A list of dictionaries containing additional information for each entry.
            If None, every entry receives its own empty dictionary. Defaults to None.

    Returns:
        A `LabeledDataset` (or subclass, via `cls`) with one `LabeledEntry` per tagged text.

    Raises:
        ValueError: If `tagged` and `infos` do not have the same length.
    """
    tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    if infos is None:
        # One fresh dict per entry: `[{}] * n` would repeat a single dict object, so a change to the info of
        # one entry would show up in all of them.
        infos = [{} for _ in tagged]
    return cls(
        [
            LabeledEntry.from_tagged(
                text,
                tokenizer=tokenizer,
                keep_tags=keep_tags,
                ignore_tags=ignore_tags,
                info=info,
            )
            for text, info in tqdm(
                zip(tagged, infos, strict=True), desc="Creating labeled dataset", total=len(tagged), unit="entries"
            )
        ]
    )

from_tokens `classmethod` ¶

from_tokens(
    tokens: list[list[str]],
    labels: Sequence[Sequence[LabelType]],
    infos: list[InfoDictType] | None = None,
    keep_labels: list[str] = [],
    ignore_labels: list[str] = [],
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict = {},
) -> LabeledDataset

Create a LabeledDataset from a set of tokenized texts.

Parameters:

Name	Type	Description	Default
`tokens` ¶	`list[list[str]]`	A list of lists of string tokens, one list per entry.	required
`labels` ¶	`Sequence[Sequence[str \| int \| float \| None]]`	A list of lists of labels for the tokens, one list per entry.	required
`infos` ¶	`list[dict[str, str \| int \| float \| bool]] \| None`	A list of dictionaries containing additional information for each entry. If None, no additional information is added. Defaults to None.	`None`
`keep_labels` ¶	`list[str]`	A list of labels to keep.	`[]`
`ignore_labels` ¶	`list[str]`	A list of labels to ignore.	`[]`
`tokenizer` ¶	`str \| Tokenizer \| PreTrainedTokenizer \| PreTrainedTokenizerFast \| None`	A `Tokenizer` used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses whitespace tokenization by default.	`None`
`tokenizer_kwargs` ¶	`dict`	Additional arguments for the tokenizer.	`{}`

Source code in labl/data/labeled_dataset.py

@classmethod
def from_tokens(
    cls,
    tokens: list[list[str]],
    labels: Sequence[Sequence[LabelType]],
    infos: list[InfoDictType] | None = None,
    keep_labels: list[str] = [],
    ignore_labels: list[str] = [],
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict = {},
) -> "LabeledDataset":
    """Create a `LabeledDataset` from a set of tokenized texts.

    Args:
        tokens (list[list[str]]):
            A list of lists of string tokens, one list per entry.
        labels (Sequence[Sequence[str | int | float | None]]):
            A list of lists of labels for the tokens, one list per entry.
        infos (list[dict[str, str | int | float | bool]] | None):
            A list of dictionaries containing additional information for each entry.
            If None, every entry receives its own empty dictionary. Defaults to None.
        keep_labels (list[str]): A list of labels to keep.
        ignore_labels (list[str]): A list of labels to ignore.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None): A `Tokenizer`
            used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses
            whitespace tokenization by default.
        tokenizer_kwargs (dict): Additional arguments for the tokenizer.

    Returns:
        A `LabeledDataset` (or subclass, via `cls`) with one `LabeledEntry` per token list.

    Raises:
        ValueError: If `tokens`, `labels` and `infos` do not have the same length.
    """
    tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    if infos is None:
        # One fresh dict per entry: `[{}] * n` would repeat a single dict object, so a change to the info of
        # one entry would show up in all of them.
        infos = [{} for _ in tokens]
    return cls(
        [
            LabeledEntry.from_tokens(
                tokens=entry_tokens,
                labels=entry_labels,
                keep_labels=keep_labels,
                ignore_labels=ignore_labels,
                tokenizer=tokenizer,
                info=info,
            )
            # strict zip (as in the sibling constructors) rejects mismatched lengths instead of silently
            # dropping surplus labels/infos.
            for entry_tokens, entry_labels, info in tqdm(
                zip(tokens, labels, infos, strict=True),
                desc="Creating LabeledDataset",
                total=len(tokens),
                unit="entries",
            )
        ]
    )

labl.data.edited_dataset.EditedDataset ¶

EditedDataset(
    iterable: Iterable[LabeledObject] | None = None,
    *,
    info: InfoDictType = {},
)

Bases: BaseLabeledDataset[EditedEntry]

Dataset class for handling collections of EditedEntry and MultiEditEntry objects.

Attributes:

Name	Type	Description
`data`	`list[EditedEntry] \| list[MultiEditEntry]`	A list of `EditedEntry` or `MultiEditEntry` objects.

Source code in labl/data/base_sequence.py

def __init__(self, iterable: Iterable[LabeledObject] | None = None, *, info: InfoDictType = {}):
    """Initialize the dataset from an optional iterable of labeled objects.

    Args:
        iterable (Iterable[LabeledObject] | None): Objects used to populate the dataset. If None, the parent
            sequence is initialized without arguments.
        info (InfoDictType): Additional information attached to the dataset. The mapping is shallow-copied on
            assignment, so the dataset never shares its info dictionary with the caller or with other datasets.
    """
    if iterable is None:
        super().__init__()
    else:
        super().__init__(iterable)
    self._label_types = self._get_label_types()
    # Copy instead of storing the reference: the `{}` default is created once at definition time, so storing it
    # directly would make every dataset built without `info` share (and mutate) the same dictionary.
    self._info = dict(info)

from_edits `classmethod` ¶

from_edits(
    texts: list[str],
    edits: list[str] | list[list[str]],
    infos: list[InfoDictType]
    | list[list[InfoDictType]]
    | None = None,
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict = {},
    with_gaps: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> EditedDataset

Create an EditedDataset from a set of texts and one or more edits for each text.

Parameters:

Name	Type	Description	Default
`texts` ¶	`list[str]`	The set of texts.	required
`edits` ¶	`list[str] \| list[list[str]]`	One or more edited versions for each text.	required
`infos` ¶	`list[dict[str, str \| int \| float \| bool]] \| list[list[dict[str, str \| int \| float \| bool]]] \| None`	A list of dictionaries containing additional information for each entry. If multiple edits are provided for each text, `infos` can be a list of lists of dictionaries (one per edit per entry). If None, no additional information is added. Defaults to None.	`None`
`tokenizer` ¶	`str \| Tokenizer \| PreTrainedTokenizer \| PreTrainedTokenizerFast \| None`	A `Tokenizer` used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses whitespace tokenization by default.	`None`
`tokenizer_kwargs` ¶	`dict`	Additional arguments for the tokenizer.	`{}`
`with_gaps` ¶	`bool`	Whether to add gaps to the tokens and offsets. Gaps are used to mark the positions of insertions and deletions in the original/edited texts, respectively. If false, those are merged to the next token to the right. Default: True.	`True`
`sub_label` ¶	`str`	The label for substitutions. Default: "S".	`'S'`
`ins_label` ¶	`str`	The label for insertions. Default: "I".	`'I'`
`del_label` ¶	`str`	The label for deletions. Default: "D".	`'D'`
`gap_token` ¶	`str`	The token to use for gaps. Default: "▁".	`'▁'`

Source code in labl/data/edited_dataset.py

@classmethod
def from_edits(
    cls,
    texts: list[str],
    edits: list[str] | list[list[str]],
    infos: list[InfoDictType] | list[list[InfoDictType]] | None = None,
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict = {},
    with_gaps: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> "EditedDataset":
    """Create an `EditedDataset` from a set of texts and one or more edits for each text.

    Args:
        texts (list[str]):
            The set of texts.
        edits (list[str] | list[list[str]]):
            One or more edited versions for each text.
        infos (list[dict[str, str | int | float | bool]] | list[list[dict[str, str | int | float | bool]]] | None):
            A list of dictionaries containing additional information for each entry.
            If multiple edits are provided for each text, `infos` can be a list of lists of dictionaries (one per
            edit per entry). If None, every entry receives its own empty dictionary. Defaults to None.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None): A `Tokenizer`
            used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses
            whitespace tokenization by default.
        tokenizer_kwargs (dict): Additional arguments for the tokenizer.
        with_gaps (bool): Whether to add gaps to the tokens and offsets. Gaps are used to mark the positions of
            insertions and deletions in the original/edited texts, respectively. If false, those are merged to the
            next token to the right. Default: True.
        sub_label (str): The label for substitutions. Default: "S".
        ins_label (str): The label for insertions. Default: "I".
        del_label (str): The label for deletions. Default: "D".
        gap_token (str): The token to use for gaps. Default: "▁".

    Returns:
        An `EditedDataset` (or subclass, via `cls`) with one entry per text.

    Raises:
        ValueError: If `texts`, `edits` and `infos` do not have the same length.
    """
    tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    if infos is None:
        # One fresh dict per entry: `[{}] * n` would repeat a single dict object, so a change to the info of
        # one entry would show up in all of them.
        infos = [{} for _ in texts]
    return cls(
        [
            EditedEntry.from_edits(
                text=text,
                edits=edit,
                tokenizer=tokenizer,
                with_gaps=with_gaps,
                sub_label=sub_label,
                ins_label=ins_label,
                del_label=del_label,
                gap_token=gap_token,
                info=info,
            )
            for text, edit, info in tqdm(
                zip(texts, edits, infos, strict=True),
                desc="Creating EditedDataset",
                total=len(texts),
                unit="entries",
            )
        ]
    )

from_edits_dataframe `classmethod` ¶

from_edits_dataframe(
    df,
    text_column: str,
    edit_column: str,
    entry_ids: str | list[str],
    infos_columns: list[str] = [],
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict[str, Any] = {},
    with_gaps: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> EditedDataset

Create an EditedDataset from a pandas.DataFrame with edits.

Every row in the DataFrame is an entry identified uniquely by entry_ids. The text_column contains the original text, and the edit_column contains the edits. If multiple rows with the same entry_ids are present, they are all treated as edits of the same text.

Parameters:

Name	Type	Description	Default
`df` ¶	`DataFrame`	The DataFrame containing the text and edits.	required
`text_column` ¶	`str`	The name of the column in the dataframe containing the original text.	required
`edit_column` ¶	`str`	The name of the column in the dataframe containing the edited text.	required
`entry_ids` ¶	`str \| list[str]`	One or more column names acting as unique identifiers for each entry. If multiple entries are found with the same `entry_ids`, they are all treated as edits of the same text.	required
`infos_columns` ¶	`list[str]`	A list of columns containing additional information for each entry.	`[]`
`tokenizer` ¶	`str \| Tokenizer \| PreTrainedTokenizer \| PreTrainedTokenizerFast \| None`	A `Tokenizer` used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses whitespace tokenization by default.	`None`
`tokenizer_kwargs` ¶	`dict[str, Any]`	Additional arguments for the tokenizer. Defaults to {}.	`{}`
`with_gaps` ¶	`bool`	Whether to add gaps to the tokens and offsets. Gaps are used to mark the positions of insertions and deletions in the original/edited texts, respectively. If false, those are merged to the next token to the right. Default: True.	`True`
`sub_label` ¶	`str`	The label for substitutions. Default: "S".	`'S'`
`ins_label` ¶	`str`	The label for insertions. Default: "I".	`'I'`
`del_label` ¶	`str`	The label for deletions. Default: "D".	`'D'`
`gap_token` ¶	`str`	The token to use for gaps. Default: "▁".	`'▁'`

Returns:

Type	Description
`EditedDataset`	An `EditedDataset` initialized from the set of texts and edits.

Source code in labl/data/edited_dataset.py

@classmethod
def from_edits_dataframe(
    cls,
    df,
    text_column: str,
    edit_column: str,
    entry_ids: str | list[str],
    infos_columns: list[str] = [],
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict[str, Any] = {},
    with_gaps: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> "EditedDataset":
    """Create an `EditedDataset` from a `pandas.DataFrame` with edits.

    Every row in the DataFrame is an entry identified uniquely by `entry_ids`. The `text_column` contains the
    original text, and the `edit_column` contains the edits. If multiple rows with the same `entry_ids` are
    present, they are all treated as edits of the same text, and the text of the first such row is used as the
    original.

    Args:
        df (pandas.DataFrame): The DataFrame containing the text and edits.
        text_column (str): The name of the column in the dataframe containing the original text.
        edit_column (str): The name of the column in the dataframe containing the edited text.
        entry_ids (str | list[str]): One or more column names acting as unique identifiers for each entry. If
            multiple rows are found with the same `entry_ids`, they are all treated as edits of the same text.
        infos_columns (list[str]): A list of columns containing additional information for each entry. One
            dictionary is built per row (i.e. per edit) from these columns.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None, optional): A `Tokenizer`
            used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses
            whitespace tokenization by default.
        tokenizer_kwargs (dict[str, Any], optional): Additional arguments for the tokenizer. Defaults to {}.
        with_gaps (bool): Whether to add gaps to the tokens and offsets. Gaps are used to mark the positions of
            insertions and deletions in the original/edited texts, respectively. If false, those are merged to the
            next token to the right. Default: True.
        sub_label (str): The label for substitutions. Default: "S".
        ins_label (str): The label for insertions. Default: "I".
        del_label (str): The label for deletions. Default: "D".
        gap_token (str): The token to use for gaps. Default: "▁".

    Returns:
        An `EditedDataset` (or subclass, via `cls`) initialized from the set of texts and edits.

    Raises:
        ImportError: If pandas is not installed.
    """
    if not is_pandas_available():
        raise ImportError("Pandas is not installed. Please install pandas to use this function.")
    import pandas as pd

    tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    df = cast(pd.DataFrame, df)
    if isinstance(entry_ids, str):
        entry_ids = [entry_ids]
    # Iterate over the groups directly: each group already holds the rows sharing the same `entry_ids` (in their
    # original order), so there is no need to rescan the whole DataFrame with a boolean mask once per group.
    grouped = df.groupby(entry_ids)
    all_texts = []
    all_edits = []
    all_infos = []
    for _, edit_rows in tqdm(grouped, desc="Extracting texts and edits", total=grouped.ngroups, unit="entries"):
        # All rows of a group describe the same original text, so the first one is used.
        all_texts.append(edit_rows[text_column].tolist()[0])
        all_edits.append(edit_rows[edit_column].tolist())
        all_infos.append([{col: edit_row[col] for col in infos_columns} for _, edit_row in edit_rows.iterrows()])
    # Build through `cls` rather than the hard-coded class so that subclasses get an instance of their own type.
    return cls.from_edits(
        all_texts,
        all_edits,
        all_infos,
        tokenizer=tokenizer,
        with_gaps=with_gaps,
        sub_label=sub_label,
        ins_label=ins_label,
        del_label=del_label,
        gap_token=gap_token,
    )

merge_gap_annotations ¶

merge_gap_annotations(
    merge_fn: Callable[[Sequence[LabelType]], LabelType]
    | None = None,
    keep_final_gap: bool = True,
) -> None

Merge gap annotations in the tokens of orig and edit.

This method is equivalent to calling EditedEntry.from_edits with with_gaps=False. Gap annotations are merged to the next non-gap token to the right, and the gap label is added to the label of the non-gap token. The last gap is kept to account for insertions at the end of the text.

E.g. the tokens `GAP Hello GAP World GAP ! GAP` with labels `I S I - - - I` (`-` = no label) become `Hello World ! GAP` with labels `IS I - I`.

Source code in labl/data/edited_dataset.py

def merge_gap_annotations(
    self,
    merge_fn: Callable[[Sequence[LabelType]], LabelType] | None = None,
    keep_final_gap: bool = True,
) -> None:
    """Merge gap annotations in the tokens of `orig` and `edit` for every entry of the dataset.

    This method is equivalent to calling `EditedEntry.from_edits` with `with_gaps=False`. Gap annotations are merged
    to the next non-gap token to the right, and the gap label is added to the label of the non-gap token. The last
    gap is kept to account for insertions at the end of the text.

    E.g. (`-` marks a token without label):

        tokens: `GAP Hello GAP World GAP ! GAP`  ->  `Hello World ! GAP`
        labels: ` I    S    I    -    -  -  I `  ->  ` IS     I   -  I `

    Args:
        merge_fn (Callable[[Sequence[LabelType]], LabelType] | None): Forwarded unchanged to the
            `merge_gap_annotations` method of each entry. Presumably combines the labels being merged; confirm
            against the entry implementation.
        keep_final_gap (bool): Forwarded unchanged to the `merge_gap_annotations` method of each entry.
            Presumably controls whether the trailing gap is kept, as described above; confirm against the entry
            implementation.
    """
    for item in self:
        # The cast only narrows the static type; the merge itself is delegated to the entry.
        edited = cast(EditedEntry | MultiEditEntry, item)
        edited.merge_gap_annotations(merge_fn=merge_fn, keep_final_gap=keep_final_gap)

Dataset¶

labl.data.base_sequence.BaseLabeledDataset ¶

get_agreement ¶

labl.data.labeled_dataset.LabeledDataset ¶

from_spans classmethod ¶

texts ¶

spans ¶

infos ¶

tokenizer ¶

tokenizer_kwargs ¶

from_tagged classmethod ¶

tagged ¶

tokenizer ¶

infos ¶

keep_tags ¶

ignore_tags ¶

tokenizer_kwargs ¶

from_tokens classmethod ¶

tokens ¶

labels ¶

infos ¶

keep_labels ¶

ignore_labels ¶

tokenizer ¶

tokenizer_kwargs ¶

labl.data.edited_dataset.EditedDataset ¶

from_edits classmethod ¶

texts ¶

edits ¶

infos ¶

tokenizer ¶

tokenizer_kwargs ¶

with_gaps ¶

sub_label ¶

ins_label ¶

del_label ¶

gap_token ¶

from_edits_dataframe classmethod ¶

df ¶

text_column ¶

edit_column ¶

entry_ids ¶

infos_columns ¶

tokenizer ¶

tokenizer_kwargs ¶

with_gaps ¶

sub_label ¶

ins_label ¶

del_label ¶

gap_token ¶

merge_gap_annotations ¶

from_spans `classmethod` ¶

`texts` ¶

`spans` ¶

`infos` ¶

`tokenizer` ¶

`tokenizer_kwargs` ¶

from_tagged `classmethod` ¶

`tagged` ¶

`tokenizer` ¶

`infos` ¶

`keep_tags` ¶

`ignore_tags` ¶

`tokenizer_kwargs` ¶

from_tokens `classmethod` ¶

`tokens` ¶

`labels` ¶

`infos` ¶

`keep_labels` ¶

`ignore_labels` ¶

`tokenizer` ¶

`tokenizer_kwargs` ¶

from_edits `classmethod` ¶

`texts` ¶

`edits` ¶

`infos` ¶

`tokenizer` ¶

`tokenizer_kwargs` ¶

`with_gaps` ¶

`sub_label` ¶

`ins_label` ¶

`del_label` ¶

`gap_token` ¶

from_edits_dataframe `classmethod` ¶

`df` ¶

`text_column` ¶

`edit_column` ¶

`entry_ids` ¶

`infos_columns` ¶

`tokenizer` ¶

`tokenizer_kwargs` ¶

`with_gaps` ¶

`sub_label` ¶

`ins_label` ¶

`del_label` ¶

`gap_token` ¶