Skip to content

Entry

labl.data.base_entry.BaseLabeledEntry

Bases: LabeledInterface, ABC

Base class for all data entries. This class handles the creation of public getters, disallowing setting and providing a private constructor key to prevent direct instantiation.

relabel

relabel(
    relabel_fn: Callable[[LabelType], LabelType]
    | None = None,
    relabel_map: dict[str | int, LabelType] | None = None,
) -> None

Relabels the entry in-place using a custom relabeling function or a mapping.

Parameters:

Name Type Description Default

relabel_fn

Callable[[str | int | float | None], str | int | float | None]

A function that will be applied to each label in the entry. The function should take a single argument (the label) and return the new label. The function should return the label without any processing if the label should be preserved.

None

relabel_map

dict[str | int, str | int | float | None]

A dictionary that maps old labels to new labels. The keys are the old labels and the values are the new labels. This can be used instead of the relabel_fn to relabel the entry if labels are discrete.

None
Source code in labl/data/base_entry.py
def relabel(
    self,
    relabel_fn: Callable[[LabelType], LabelType] | None = None,
    relabel_map: dict[str | int, LabelType] | None = None,
) -> None:
    """Rewrite the labels of this entry in-place.

    Exactly one of the two arguments is normally given. When both are passed, `relabel_fn` is used and
    `relabel_map` is ignored.

    Args:
        relabel_fn (Callable[[str | int | float | None], str | int | float | None]):
            Callable applied to every label of the entry. It receives the current label and returns the new one;
            labels that should be preserved must be returned unchanged.
        relabel_map (dict[str | int, str | int | float | None]):
            Lookup table from old labels to new labels, usable instead of `relabel_fn` for discrete labels.
            `None` and `float` labels are never looked up, and labels missing from the table are kept as they are.

    Raises:
        ValueError: If neither `relabel_fn` nor `relabel_map` is provided.
    """
    if relabel_fn is not None:
        label_mapper = relabel_fn
    elif relabel_map is not None:

        def label_mapper(label):
            # Missing and continuous labels bypass the table entirely.
            if label is None or isinstance(label, float):
                return label
            return relabel_map.get(label, label)

    else:
        raise ValueError("Either relabel_fn or relabel_map must be provided.")

    self._relabel_attributes(label_mapper)
    # Label types may have changed after the rewrite, so refresh the cached value.
    self._label_types = self._get_label_types()

get_agreement

get_agreement(
    other: EntryType,
    level_of_measurement: LevelOfMeasurement | None = None,
) -> AgreementOutput

Compute the inter-annotator agreement for the token labels of two entries using Krippendorff's alpha.

Source code in labl/data/base_entry.py
def get_agreement(
    self: EntryType,
    other: EntryType,
    level_of_measurement: LevelOfMeasurement | None = None,
) -> AgreementOutput:
    """Measure how much the token labels of this entry agree with those of `other` using
    [Krippendorff's alpha](https://en.wikipedia.org/wiki/Krippendorff%27s_alpha).

    Args:
        other (EntryType): The entry whose labels are compared against this one.
        level_of_measurement (LevelOfMeasurement | None): Forwarded unchanged to `get_labels_agreement`.

    Returns:
        The output of `get_labels_agreement` for the labels of the two entries.

    Raises:
        RuntimeError: If the first label type of the two entries differs.
    """
    # Both entries are validated to hold a single label type before being compared.
    for entry in (self, other):
        entry._validate_single_label_type()

    own_type = self.label_types[0]
    other_type = other.label_types[0]
    if own_type != other_type:
        raise RuntimeError(
            f"Label type does not match: {own_type} vs {other_type}.\n"
            "Transform the annotations using `.relabel` to ensure a single type is present."
        )

    return get_labels_agreement(
        label_type=own_type,
        labels_array=self._get_labels_array([self, other], dtype=own_type),
        level_of_measurement=level_of_measurement,
    )

labl.data.labeled_entry.LabeledEntry

LabeledEntry(
    text: str,
    spans: SpanList,
    tagged: str,
    tokens: list[str],
    tokens_labels: Sequence[LabelType],
    tokens_offsets: list[OffsetType],
    info: InfoDictType = {},
    constructor_key: object | None = None,
)

Bases: BaseLabeledEntry

Class for a text entry with a set of granular annotations over some of its parts.

The class provides a centralized object to easily switch between different annotation formats:

  • The original text without labels.
  • A tagged version with ... labels.
  • A list of spans corresponding to tagged substrings, with start/end indices.
  • A tokenized version of the text with labels associated to each token.

Attributes:

Name Type Description
text str

The original text.

spans list[Span]

A list of Span items, each containing the start/end indices of the span, the contained text and a label associated to it.

tagged str

A tagged version of text containing tags like <tag>...</tag> to mark labels from spans or tokens.

tokens list[str]

A list of strings representing the tokenized version of text.

tokens_labels list[str | int | float | None]

A list of the same length as tokens containing labels associated with every token. Labels can be strings, numbers or None.

labeled_tokens list[list[LabeledToken]]

A list of LabeledToken objects joining tokens and tokens_labels.

tokens_offsets list[tuple[int, int] | None]

Offsets for each token in tokens. The i-th element corresponds to the i-th token in tokens. The offsets are tuples of the form (start, end) corresponding to start and end positions of the token in text. If the token does not exist in text, the offset is None.

label_types list[type]

A list of the types of labels for the entry.

info dict[str, str | int | float | bool]

A dictionary containing additional information about the entry.

A LabeledEntry can be initialized from:

  • A tagged text, e.g. Hello <error>world</error>!, using LabeledEntry.from_tagged(tagged=...).

  • A text and a list of labeled spans, e.g. Hello world! and [{'start': 0, 'end': 5, 'label': 'error'}], using LabeledEntry.from_spans(text=..., spans=...).

  • Two aligned lists of tokens and labels with string/numeric labels, e.g. ['Hel', 'lo', 'world', '!'] and [0.5, 0.7, 1, 0], using LabeledEntry.from_tokens(tokens=..., labels=...).

Source code in labl/data/labeled_entry.py
def __init__(
    self,
    text: str,
    spans: SpanList,
    tagged: str,
    tokens: list[str],
    tokens_labels: Sequence[LabelType],
    tokens_offsets: list[OffsetType],
    info: InfoDictType = {},
    constructor_key: object | None = None,
):
    """Private constructor for `LabeledEntry`.

    A `LabeledEntry` can be initialized from:

    * A `tagged` text, e.g. `Hello <error>world</error>!`, using `LabeledEntry.from_tagged(tagged=...)`.

    * A `text` and a list of labeled `spans`, e.g. `Hello world!` and `[{'start': 0, 'end': 5, 'label': 'error'}]`,
        using `LabeledEntry.from_spans(text=..., spans=...)`.

    * Two aligned lists of `tokens` and `labels` with string/numeric labels, e.g. `['Hel', 'lo', 'world', '!']`
        and `[0.5, 0.7, 1, 0]`, using `LabeledEntry.from_tokens(tokens=..., labels=...)`.

    Args:
        text (str): The original text.
        spans (SpanList): Labeled spans of `text`.
        tagged (str): The tagged version of `text`.
        tokens (list[str]): The tokenized version of `text`.
        tokens_labels (Sequence[LabelType]): One label per token.
        tokens_offsets (list[OffsetType]): One offset per token.
        info (InfoDictType): Additional information about the entry. A shallow copy is stored, so entries never
            share the dictionary with each other or with the caller.
        constructor_key (object | None): Must match the class-private key; the `from_*` factories pass it.

    Raises:
        RuntimeError: If `constructor_key` does not match the private key, i.e. on direct instantiation.
    """
    if constructor_key != self.__constructor_key:
        raise RuntimeError(
            dedent("""\
            The default constructor for `LabeledEntry` is private. A `LabeledEntry` can be initialized from:

            * A `tagged` text, e.g. `Hello <error>world</error>!`, using `LabeledEntry.from_tagged(tagged=...)`.

            * A `text` and a list of labeled `spans`, e.g. `Hello world!` and `[{'start': 0, 'end': 5, 'label': 'error'}]`,
                using `LabeledEntry.from_spans(text=..., spans=...)`.

            * Two aligned lists of `tokens` and `labels` with string/numeric labels, e.g. `['Hel', 'lo', 'world', '!']`
                and `[0.5, 0.7, 1, 0]`, using `LabeledEntry.from_tokens(tokens=..., labels=...)`.
            """)
        )
    self._text = text
    self._spans = spans
    self._tagged = tagged
    self._tokens = tokens
    self._tokens_labels = tokens_labels
    self._tokens_offsets = tokens_offsets
    # The `{}` default (here and in the factories) is a single shared object: storing it by reference
    # would make every entry created without `info` alias the same dict. Keep a private shallow copy.
    self._info = dict(info) if info is not None else {}
    self._label_types = self._get_label_types()

text property writable

text: str

The input text. This is a read-only property.

spans property writable

spans: SpanList

Labeled spans of the text. This is a read-only property.

tagged property writable

tagged: str

The tagged version of the text. This is a read-only property.

tokens property writable

tokens: list[str]

The tokenized version of the text. This is a read-only property.

tokens_labels property writable

tokens_labels: Sequence[LabelType]

The labels associated with the tokens. This is a read-only property.

tokens_offsets property writable

tokens_offsets: list[OffsetType]

The offsets for each token in the text. This is a read-only property.

labeled_tokens property writable

labeled_tokens: LabeledTokenList

Returns a list of LabeledToken objects joining tokens and tokens_labels with custom visualization.

from_spans classmethod

from_spans(
    text: str,
    spans: list[Span] | list[SpanType],
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict = {},
    info: InfoDictType = {},
) -> LabeledEntry

Create a LabeledEntry from a text and a list of spans.

Parameters:

Name Type Description Default

text

str

The original text.

required

spans

list[Span] | list[dict[str, str | int | float | None]]

A list or a list of lists of Span items or equivalent dicts containing information about specific spans in text.

required

tokenizer

str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None

A Tokenizer used for tokenization. Supports initialization from a transformers.PreTrainedTokenizer, and uses whitespace tokenization by default.

None

tokenizer_kwargs

dict

Additional arguments for the tokenizer.

{}

info

dict[str, str | int | float | bool]

A dictionary containing additional information about the entry.

{}
Source code in labl/data/labeled_entry.py
@classmethod
def from_spans(
    cls,
    text: str,
    spans: list[Span] | list[SpanType],
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict = {},
    info: InfoDictType = {},
) -> "LabeledEntry":
    """Build a `LabeledEntry` from a plain text and the spans annotated over it.

    Args:
        text (str):
            The original text.
        spans (list[Span] | list[dict[str, str | int | float | None]]):
            `Span` items, or equivalent dicts, describing labeled portions of `text`. They are normalized
            with `Span.from_list`.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None):
            Tokenizer specification, resolved with `get_tokenizer`.
        tokenizer_kwargs (dict): Additional arguments passed to `get_tokenizer`.
        info (dict[str, str | int | float | bool]):
            A dictionary containing additional information about the entry.

    Returns:
        The new `LabeledEntry`, with tokens and tagged text derived from `text` and `spans`.
    """
    resolved_tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    span_list = Span.from_list(spans)
    # Token-level view first, then the tagged string, both derived from the same normalized spans.
    entry_tokens, entry_labels, entry_offsets = cls.get_tokens_from_spans(
        text=text, spans=span_list, tokenizer=resolved_tokenizer
    )
    tagged_text = cls.get_tagged_from_spans(text=text, spans=span_list)
    return cls(
        text=text,
        spans=span_list,
        tagged=tagged_text,
        tokens=entry_tokens,
        tokens_labels=entry_labels,
        tokens_offsets=entry_offsets,
        info=info,
        constructor_key=cls.__constructor_key,
    )

from_tagged classmethod

from_tagged(
    tagged: str,
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    keep_tags: list[str] = [],
    ignore_tags: list[str] = [],
    tokenizer_kwargs: dict = {},
    info: InfoDictType = {},
) -> LabeledEntry

Create a LabeledEntry from a tagged text.

Parameters:

Name Type Description Default

tagged

str

Tagged version of text.

required

tokenizer

str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None

A Tokenizer used for tokenization. Supports initialization from a transformers.PreTrainedTokenizer, and uses whitespace tokenization by default.

None

keep_tags

list[str]

Tag(s) used to mark selected spans, e.g. h for tags like <h>...</h>. If not provided, all tags are kept (Default: []).

[]

ignore_tags

list[str]

Tag(s) that are present in the text but should be ignored while parsing. If not provided, all tags are kept (Default: []).

[]

tokenizer_kwargs

dict

Additional arguments for the tokenizer.

{}

info

dict[str, str | int | float | bool]

A dictionary containing additional information about the entry.

{}
Source code in labl/data/labeled_entry.py
@classmethod
def from_tagged(
    cls,
    tagged: str,
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    keep_tags: list[str] = [],
    ignore_tags: list[str] = [],
    tokenizer_kwargs: dict = {},
    info: InfoDictType = {},
) -> "LabeledEntry":
    """Build a `LabeledEntry` from a text annotated with inline tags.

    Args:
        tagged (str): Tagged version of the text, e.g. `Hello <error>world</error>!`.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None):
            Tokenizer specification, resolved with `get_tokenizer`.
        keep_tags (list[str]):
            Tag name(s) marking the spans to extract, e.g. `h` for `<h>...</h>`. Forwarded to
            `get_text_and_spans_from_tagged` (Default: []).
        ignore_tags (list[str]):
            Tag name(s) present in the text that should be ignored while parsing. Forwarded to
            `get_text_and_spans_from_tagged` (Default: []).
        tokenizer_kwargs (dict): Additional arguments passed to `get_tokenizer`.
        info (dict[str, str | int | float | bool]):
            A dictionary containing additional information about the entry.

    Returns:
        The new `LabeledEntry`; its `tagged` attribute is the input string, unchanged.
    """
    resolved_tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    clean_text, parsed_spans = cls.get_text_and_spans_from_tagged(
        tagged=tagged, keep_tags=keep_tags, ignore_tags=ignore_tags
    )
    entry_tokens, entry_labels, entry_offsets = cls.get_tokens_from_spans(
        text=clean_text, spans=parsed_spans, tokenizer=resolved_tokenizer
    )
    return cls(
        text=clean_text,
        spans=parsed_spans,
        tagged=tagged,
        tokens=entry_tokens,
        tokens_labels=entry_labels,
        tokens_offsets=entry_offsets,
        info=info,
        constructor_key=cls.__constructor_key,
    )

from_tokens classmethod

from_tokens(
    tokens: list[str],
    labels: Sequence[LabelType],
    text: str | None = None,
    offsets: list[OffsetType] | None = None,
    keep_labels: list[str] = [],
    ignore_labels: list[str] = [],
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict = {},
    info: InfoDictType = {},
) -> LabeledEntry

Create a LabeledEntry from a list of tokens.

Parameters:

Name Type Description Default

tokens

list[str] | None

A list of tokens. Must have the same length as labels.

required

labels

list[str | int | float | None] | None

A list of labels for the tokens, one per token. Must have the same length as tokens.

required

text

str | None

The original text. If not provided, it is detokenized from tokens using the tokenizer.

None

offsets

list[tuple[int, int] | None] | None

The offsets for each token in tokens. The i-th element corresponds to the i-th token in tokens. The offsets are tuples of the form (start, end) corresponding to start and end positions of the token in text. If the token does not exist in text, the offset is None. If not provided, it is computed using the tokenizer.

None

tokenizer

str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None

A Tokenizer used for tokenization. Supports initialization from a transformers.PreTrainedTokenizer, and uses whitespace tokenization by default.

None

keep_labels

list[str]

Label(s) used to mark selected tokens. If not provided, all labels are kept (Default: []).

[]

ignore_labels

list[str]

Label(s) that are present on tokens but should be ignored while parsing. If not provided, all labels are kept (Default: []).

[]

tokenizer_kwargs

dict

Additional arguments for the tokenizer.

{}

info

dict[str, str | int | float | bool]

A dictionary containing additional information about the entry.

{}
Example
from labl.data.labeled_entry import LabeledEntry

entry = LabeledEntry.from_tokens(
    tokens=[
        "Apple", "Inc.", "is", "looking", "at", "buying",
        "U.K.", "startup", "for", "$1", "billion",
    ],
    labels=[
        "ORG", "ORG", "O", "O", "O", "O",
        "LOC", "O", "O", "MONEY", "MONEY",
    ],
    ignore_labels=["O"],
)
print(entry.tokens)
>>> Apple Inc. is looking at buying U.K. startup for    $1 billion
      ORG  ORG                       LOC             MONEY   MONEY
Source code in labl/data/labeled_entry.py
@classmethod
def from_tokens(
    cls,
    tokens: list[str],
    labels: Sequence[LabelType],
    text: str | None = None,
    offsets: list[OffsetType] | None = None,
    keep_labels: list[str] = [],
    ignore_labels: list[str] = [],
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict = {},
    info: InfoDictType = {},
) -> "LabeledEntry":
    """Create a `LabeledEntry` from a list of tokens and their labels.

    Args:
        tokens (list[str]):
            A list of tokens. Must have the same length as `labels`.
        labels (Sequence[str | int | float | None]):
            A list of labels for the tokens, one per token.
        text (str | None):
            The original text. If not provided, it is detokenized from `tokens` using the tokenizer.
        offsets (list[tuple[int, int] | None] | None):
            The offsets for each token in `tokens`. The i-th element corresponds to the i-th token in `tokens`.
            The offsets are tuples of the form `(start, end)` corresponding to start and end positions of the
            token in `text`. If the token does not exist in `text`, the offset is `None`. If not provided, it is
            computed by tokenizing `text` with the tokenizer.
        keep_labels (list[str]):
            Label(s) used to mark selected tokens, forwarded to `get_spans_from_tokens`. If not provided,
            all labels are kept (Default: []).
        ignore_labels (list[str]):
            Label(s) that are present on tokens but should be ignored when building spans, forwarded to
            `get_spans_from_tokens` (Default: []).
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None):
            Tokenizer specification, resolved with `get_tokenizer`.
        tokenizer_kwargs (dict): Additional arguments for the tokenizer.
        info (dict[str, str | int | float | bool]):
            A dictionary containing additional information about the entry.

    Returns:
        The new `LabeledEntry`. `tokens` and `labels` are stored as given; spans and the tagged text are
        derived from them.

    Raises:
        RuntimeError: If `tokens` and `labels` have different lengths.

    Example:
        ```python
        from labl.data.labeled_entry import LabeledEntry

        entry = LabeledEntry.from_tokens(
            tokens=[
                "Apple", "Inc.", "is", "looking", "at", "buying",
                "U.K.", "startup", "for", "$1", "billion",
            ],
            labels=[
                "ORG", "ORG", "O", "O", "O", "O",
                "LOC", "O", "O", "MONEY", "MONEY",
            ],
            ignore_labels=["O"],
        )
        print(entry.tokens)
        >>> Apple Inc. is looking at buying U.K. startup for    $1 billion
              ORG  ORG                       LOC             MONEY   MONEY
        ```
    """
    if len(tokens) != len(labels):
        raise RuntimeError("The length of `tokens` and `labels` must be the same. ")
    tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    if text is None:
        # The tokenizer API is batched: take the single result for this entry.
        text = tokenizer.detokenize(tokens)[0]
    if offsets is None:
        _, all_offsets = tokenizer.tokenize_with_offsets(text)
        offsets = all_offsets[0]
    spans = cls.get_spans_from_tokens(text, labels, offsets, tokenizer, keep_labels, ignore_labels)
    tagged = cls.get_tagged_from_spans(text, spans=spans)
    return cls(
        text=text,
        spans=spans,
        tagged=tagged,
        tokens=tokens,
        tokens_labels=labels,
        tokens_offsets=offsets,
        info=info,
        constructor_key=cls.__constructor_key,
    )

get_tagged_from_spans staticmethod

get_tagged_from_spans(text: str, spans: list[Span]) -> str

Tags a text using a list of spans.

Parameters:

Name Type Description Default

text

str

The text to which tags should be added.

required

spans

list[Span]

The spans to convert to tags.

required

Returns:

Type Description
str

The tagged text.

Source code in labl/data/labeled_entry.py
@staticmethod
def get_tagged_from_spans(
    text: str,
    spans: list[Span],
) -> str:
    """Insert `<label>...</label>` tags into a text for every labeled span.

    Tags are placed at the character positions given by each span, so nested spans produce properly nested
    markup (e.g. `<a>Hello <b>big</b> world</a>`). Spans whose label is falsy are skipped.

    Args:
        text (str): The text to which tags should be added.
        spans (list[Span]): The spans to convert to tags. They can be given in any order.

    Returns:
        The tagged text, or `text` unchanged when there are no spans.
    """
    if not spans:
        return text

    # Each tag becomes an event `(position, priority, tie_1, tie_2, markup)`. Sorting on the first four
    # fields yields well-nested output at shared positions:
    #   0 - closing tags first, innermost (latest start, then latest given) span first;
    #   1 - empty spans as a self-contained `<l></l>` pair;
    #   2 - opening tags last, outermost (latest end, then earliest given) span first.
    events = []
    for idx, s in enumerate(spans):
        if not s.label:
            continue
        opening, closing = f"<{s.label}>", f"</{s.label}>"
        if s.end <= s.start:
            events.append((s.start, 1, 0, idx, opening + closing))
        else:
            events.append((s.start, 2, -s.end, idx, opening))
            events.append((s.end, 0, -s.start, -idx, closing))
    events.sort(key=lambda event: event[:4])

    # Single left-to-right pass: copy the untouched text up to each event, then emit its markup.
    pieces = []
    cursor = 0
    for position, _, _, _, markup in events:
        pieces.append(text[cursor:position])
        pieces.append(markup)
        cursor = position
    pieces.append(text[cursor:])
    return "".join(pieces)

get_tokens_from_spans staticmethod

get_tokens_from_spans(
    text: str,
    spans: list[Span],
    tokenizer: Tokenizer | None = None,
) -> tuple[
    list[str], Sequence[LabelType], list[OffsetType]
]

Extracts tokens, labels and offsets from a text and a set of labeled spans.

Parameters:

Name Type Description Default

text

str

The text to which tags should be added.

required

spans

list[Span]

The spans to convert to tokens.

required

tokenizer

Tokenizer | None

A Tokenizer used for text splitting. If not provided, whitespace tokenization is used.

None

Returns:

Type Description
tuple[list[str], Sequence[LabelType], list[OffsetType]]

A tuple (tokens, tokens_labels, tokens_offsets), which are three lists of the same length containing respectively the tokens, their labels and their (start, end) offsets.

Source code in labl/data/labeled_entry.py
@staticmethod
def get_tokens_from_spans(
    text: str,
    spans: list[Span],
    tokenizer: Tokenizer | None = None,
) -> tuple[list[str], Sequence[LabelType], list[OffsetType]]:
    """Extracts tokens, labels and offsets from a text and a set of labeled spans.

    A token receives the label of every span whose character range intersects the token's range. When several
    spans overlap the same token, their labels are combined with `+` in order of span start.

    Args:
        text (str): The text to tokenize.
        spans (list[Span]): The spans to convert to token labels.
        tokenizer (Tokenizer | None): A `Tokenizer` used for text splitting. If not provided, whitespace
            tokenization is used.

    Returns:
        A tuple `(tokens, tokens_labels, tokens_offsets)`, which are three lists of the same length containing
            respectively the tokens, their labels and their (start, end) offsets. Tokens without an offset, or
            not covered by any span, are labeled `None`.
    """
    if tokenizer is None:
        logger.info("Tokenizer was not provided. Defaulting to whitespace tokenization.")
        tokenizer = WhitespaceTokenizer()
    # The tokenizer API is batched: take the single result for this text.
    all_tokens, all_tokens_offsets = tokenizer.tokenize_with_offsets(text)
    tokens, tokens_offsets = all_tokens[0], all_tokens_offsets[0]
    if not spans:
        return tokens, [None] * len(tokens), tokens_offsets
    sorted_spans = sorted(spans, key=lambda s: s.start)
    n_spans = len(sorted_spans)

    # Pointer to the first span that may still overlap the current or a later token
    span_idx = 0
    tokens_labels = []

    for offset in tokens_offsets:
        if offset is None:
            tokens_labels.append(None)
            continue
        token_start, token_end = offset
        label = None

        # Skip spans that end before the token starts
        while span_idx < n_spans and sorted_spans[span_idx].end <= token_start:
            span_idx += 1

        # Scan forward from span_idx while the *candidate* span starts before the token ends. Spans are
        # sorted by start, so the first candidate starting at or after the token end stops the scan: neither
        # it nor any later span can overlap this token.
        check_idx = span_idx
        while check_idx < n_spans and sorted_spans[check_idx].start < token_end:
            span = sorted_spans[check_idx]

            # Standard interval intersection test between [token_start, token_end) and [span.start, span.end):
            # overlap <=> max(starts) < min(ends)
            if max(token_start, span.start) < min(token_end, span.end):
                if label is None:
                    label = span.label
                else:
                    label += span.label  # type: ignore
            check_idx += 1
        tokens_labels.append(label)
    return tokens, tokens_labels, tokens_offsets

get_text_and_spans_from_tagged staticmethod

get_text_and_spans_from_tagged(
    tagged: str,
    keep_tags: list[str] = [],
    ignore_tags: list[str] = [],
) -> tuple[str, SpanList]

Extract spans and clean text from a tagged string.

Parameters:

Name Type Description Default

tagged

str

The tagged string to extract spans from.

required

keep_tags

list[str]

Tag(s) used to mark selected spans, e.g. <h>...</h>, <error>...</error>. If not provided, all tags are kept (Default: []).

[]

ignore_tags

list[str]

Tag(s) that are present in the text but should be ignored while parsing. If not provided, all tags are kept (Default: []).

[]

Returns:

Type Description
tuple[str, SpanList]

Tuple containing the cleaned text and a list of Span objects.

Source code in labl/data/labeled_entry.py
@staticmethod
def get_text_and_spans_from_tagged(
    tagged: str,
    keep_tags: list[str] = [],
    ignore_tags: list[str] = [],
) -> tuple[str, SpanList]:
    """Extract spans and clean text from a tagged string.

    Args:
        tagged (str): The tagged string to extract spans from.
        keep_tags (list[str]):
            Tag name(s) used to mark selected spans, e.g. `h` for `<h>...</h>`. If empty, every `<tag>` found
            in the string is parsed. If provided, only these tags and those in `ignore_tags` are parsed; any
            other tag is left in the output text and a warning is emitted (Default: []).
        ignore_tags (list[str]):
            Tag name(s) whose markup is removed from the text without creating a span (Default: []).

    Returns:
        Tuple containing the cleaned text and a list of `Span` objects. Spans are appended when their closing
        tag is met, so a nested span precedes the span that contains it.

    Raises:
        RuntimeError: If a closing tag does not match the most recently opened tag, or if tags are left unclosed.
    """
    any_tag_regex = re.compile(r"<\/?(?:\w+)>")
    if not keep_tags:
        tag_regex = any_tag_regex
    else:
        # Tag names are literal strings, not patterns: escape them so regex metacharacters in a name
        # (e.g. `.` or `+`) cannot match unrelated tags or break compilation.
        tag_names = sorted(set(keep_tags) | set(ignore_tags))
        tag_match_string = "|".join(re.escape(name) for name in tag_names)
        tag_regex = re.compile(rf"<\/?(?:{tag_match_string})>")

    text_without_tags: str = ""
    spans = SpanList()
    current_pos = 0
    # Parallel stacks: names of the currently open tags and the clean-text position where each was opened.
    open_tags = []
    open_positions = []

    for match in tag_regex.finditer(tagged):
        match_text = match.group(0)
        start, end = match.span()

        # Add text before the tag
        text_without_tags += tagged[current_pos:start]
        current_pos = end

        # Check if opening or closing tag
        if match_text.startswith("</"):
            tag_name = match_text[2:-1]
            if not open_tags or open_tags[-1] != tag_name:
                raise RuntimeError(f"Closing tag {match_text} without matching opening tag")

            # Create span for the highlighted text
            open_pos = open_positions.pop()
            open_tag = open_tags.pop()
            if tag_name not in ignore_tags:
                tagged_span = Span(
                    start=open_pos,
                    end=len(text_without_tags),
                    label=open_tag,
                )
                spans.append(tagged_span)
        else:
            # Opening tag
            tag_name = match_text[1:-1]
            # Defensive guard: with `keep_tags` set, the pattern above only matches listed tag names.
            if keep_tags and (tag_name not in keep_tags and tag_name not in ignore_tags):
                raise RuntimeError(
                    f"Unexpected tag type: {tag_name}. "
                    "Specify tag types that should be preserved in the `keep_tags` argument, "
                    "and those that should be ignored in the `ignore_tags` argument."
                )
            open_tags.append(tag_name)
            open_positions.append(len(text_without_tags))

    # Add remaining text
    text_without_tags += tagged[current_pos:]
    if open_tags:
        raise RuntimeError(f"Unclosed tags: {', '.join(open_tags)}")

    # If the text contains a tag that was neither kept nor ignored, raise a warning
    unexpected_tags = any_tag_regex.search(text_without_tags)
    if unexpected_tags:
        warn(
            "The text contains tag types that were not specified in keep_tags or ignore_tags: "
            f"{unexpected_tags.group(0)}. These tags are now preserved in the output. If these should ignored "
            "instead, add them to the `ignore_tags` argument.",
            stacklevel=2,
        )
    # Span texts are filled in last, once the clean text is complete.
    for span in spans:
        span.text = text_without_tags[span.start : span.end]
    return text_without_tags, spans

get_spans_from_tokens staticmethod

get_spans_from_tokens(
    text: str,
    labels: Sequence[LabelType],
    offsets: list[OffsetType] | None = None,
    tokenizer: Tokenizer | None = None,
    keep_labels: list[str] = [],
    ignore_labels: list[str] = [],
) -> SpanList

Extract spans and clean text from a list of labeled tokens.

Parameters:

Name Type Description Default

text

str

The original text.

required

labels

list[str | int | float | None]

The labels associated with the tokens.

required

offsets

list[tuple[int, int] | None] | None

The offsets for each token in tokens. The i-th element corresponds to the i-th token in tokens.

None

tokenizer

Tokenizer | None

The tokenizer to use for tokenization. If not provided, whitespace tokenization is used.

None

keep_labels

list[str]

Token labels that should be ported over to spans. If not provided, all tags are kept (Default: []).

[]

ignore_labels

list[str]

Token labels that should be ignored while parsing. If not provided, all tags are kept (Default: []).

[]

Returns:

Type Description
SpanList

A list of Span objects corresponding to the labeled tokens.

Source code in labl/data/labeled_entry.py
@staticmethod
def get_spans_from_tokens(
    text: str,
    labels: Sequence[LabelType],
    offsets: list[OffsetType] | None = None,
    tokenizer: Tokenizer | None = None,
    keep_labels: list[str] = [],
    ignore_labels: list[str] = [],
) -> SpanList:
    """Extract labeled spans from a sequence of per-token labels.

    Consecutive tokens that share the same valid label are merged into a single span. A span is closed as soon
    as the label changes or a token that cannot belong to a span is met (ignored label, label not in
    `keep_labels`, or missing offset).

    Args:
        text (str):
            The original text.
        labels (list[str | int | float | None]):
            The labels associated with the tokens. `None` marks an unlabeled token and never produces a span.
        offsets (list[tuple[int, int] | None] | None):
            The character offsets of each token in `text`. The i-th element corresponds to the i-th label in
            `labels`; `None` entries are skipped. If not provided, offsets are computed with `tokenizer`.
        tokenizer (Tokenizer | None): The tokenizer to use for
            tokenization. If not provided, whitespace tokenization is used.
        keep_labels (list[str]):
            Token labels that should be ported over to spans. If not provided, all labels are kept (Default: []).
        ignore_labels (list[str]):
            Token labels that should be ignored while parsing. If not provided, no labels are ignored
            (Default: []).

    Returns:
        A list of `Span` objects corresponding to the labeled tokens.

    Raises:
        ValueError: If `labels` and `offsets` do not have the same length.
    """
    if offsets is None:
        if tokenizer is None:
            logger.info("Tokenizer was not provided. Defaulting to whitespace tokenization.")
            tokenizer = WhitespaceTokenizer()
        _, all_offsets = tokenizer.tokenize_with_offsets(text)
        offsets = all_offsets[0]
    curr_span_label: LabelType = None
    curr_span_start: int | None = None
    curr_span_end: int | None = None
    spans = SpanList()

    def _close_current_span() -> None:
        # Runs of unlabeled (None) tokens are tracked only to detect boundaries and are never emitted.
        if curr_span_label is not None and curr_span_start is not None and curr_span_end is not None:
            spans.append(Span(start=curr_span_start, end=curr_span_end, label=curr_span_label))

    # To be considered for a span, a token must have a valid label (not ignored) and a valid character span
    # (not a special token).
    for label, offset in zip(labels, offsets, strict=True):
        is_ignored = label in ignore_labels
        is_kept = not keep_labels or label in keep_labels
        has_valid_label = is_kept and not is_ignored
        if has_valid_label and offset is not None:
            t_start, t_end = offset
            if label == curr_span_label:
                # Same label as the open span: extend it up to the end of this token.
                curr_span_end = t_end
            else:
                # Label changed: store the finished span before opening a new one.
                _close_current_span()
                curr_span_label = label
                curr_span_start = t_start
                curr_span_end = t_end
        else:
            # Invalid token: it terminates any open span and does not start a new one.
            _close_current_span()
            curr_span_label = None
            curr_span_start = None
            curr_span_end = None
    # Flush the span that is still open when the sequence ends.
    _close_current_span()
    for span in spans:
        span.text = text[span.start : span.end]
    return spans

get_tokens

get_tokens() -> list[str]
Source code in labl/data/labeled_entry.py
def get_tokens(self) -> list[str]:
    """Return the tokens of this entry, as exposed by its `tokens` attribute."""
    entry_tokens = self.tokens
    return entry_tokens

get_labels

get_labels() -> Sequence[LabelType]
Source code in labl/data/labeled_entry.py
def get_labels(self) -> Sequence[LabelType]:
    """Return the per-token labels of this entry, as exposed by its `tokens_labels` attribute."""
    token_level_labels = self.tokens_labels
    return token_level_labels

to_dict

to_dict() -> LabeledEntryDictType

Convert the LabeledEntry to a dictionary representation.

Returns:

Type Description
LabeledEntryDictType

A dictionary representation of the LabeledEntry.

Source code in labl/data/labeled_entry.py
def to_dict(self) -> LabeledEntryDictType:
    """Serialize this `LabeledEntry` into a dictionary.

    The `_class` key stores the class name, and `spans` is serialized through its own `to_dict()` method.
    All other fields are stored as they are exposed by the entry.

    Returns:
        A dictionary representation of the `LabeledEntry`.
    """
    payload = {
        "_class": self.__class__.__name__,
        "info": self.info,
        "text": self.text,
        "tagged": self.tagged,
        "tokens": self.tokens,
        "tokens_labels": self.tokens_labels,
        "tokens_offsets": self.tokens_offsets,
        "spans": self.spans.to_dict(),
    }
    return LabeledEntryDictType(payload)

from_dict classmethod

from_dict(data: LabeledEntryDictType) -> LabeledEntry

Create a LabeledEntry from a dictionary representation.

Parameters:

Name Type Description Default

data

dict

A dictionary representation of the LabeledEntry obtained with to_dict().

required

Returns:

Type Description
LabeledEntry

A LabeledEntry object.

Source code in labl/data/labeled_entry.py
@classmethod
def from_dict(cls, data: LabeledEntryDictType) -> "LabeledEntry":
    """Rebuild a `LabeledEntry` from the dictionary produced by `to_dict()`.

    Args:
        data (dict): A dictionary representation of the `LabeledEntry` obtained with `to_dict()`.

    Returns:
        A `LabeledEntry` object.

    Raises:
        RuntimeError: If `data` has no `_class` key, or if `_class` does not match the name of this class.
    """
    if "_class" not in data:
        raise RuntimeError("The provided dictionary is missing the required _class attribute.")
    if data["_class"] != cls.__name__:
        raise RuntimeError(f"Cannot load a {cls.__name__} object from {data['_class']}")
    # Fields are read in the same order in which they are handed to the private constructor.
    init_kwargs = {
        "text": data["text"],
        "spans": Span.from_list(data["spans"]),
        "tagged": data["tagged"],
        "tokens": data["tokens"],
        "tokens_labels": data["tokens_labels"],
        "tokens_offsets": data["tokens_offsets"],
        "info": data["info"],
    }
    return cls(**init_kwargs, constructor_key=cls.__constructor_key)

labl.data.edited_entry.EditedEntry

EditedEntry(
    orig: LabeledEntry,
    edit: LabeledEntry,
    has_gaps: bool,
    has_bos_token: bool,
    has_eos_token: bool,
    aligned: WordOutput | None = None,
    info: InfoDictType = {},
    constructor_key: object | None = None,
)

Bases: BaseLabeledEntry

Class for a pair of text entries (orig and edit) where word-level annotations are obtained from the aligned tokens of the two entries.

Attributes:

Name Type Description
orig LabeledEntry

The original entry.

edit LabeledEntry

The edited entry.

has_gaps bool

Whether the token sequence has gaps. Gaps are used for text/edit pairs to mark the positions of insertions and deletions in the original/edited texts, respectively. If False, it means gap annotations were merged to the next token to the right.

has_bos_token bool

Whether the tokenizer has a beginning-of-sequence token.

has_eos_token bool

Whether the tokenizer has an end-of-sequence token.

aligned WordOutput | None

A jiwer.WordOutput with aligned tokens for orig and edit, tokenized using the provided tokenizer.

info dict[str, str | int | float | bool]

A dictionary containing additional information about the entry.

One or more EditedEntry can be initialized from a text and one or more edits, e.g. Hello world! and ["Goodbye world!", "Hello planet!"], using EditedEntry.from_edits(text=..., edits=...).

Source code in labl/data/edited_entry.py
def __init__(
    self,
    orig: LabeledEntry,
    edit: LabeledEntry,
    has_gaps: bool,
    has_bos_token: bool,
    has_eos_token: bool,
    aligned: WordOutput | None = None,
    info: InfoDictType = {},
    constructor_key: object | None = None,
):
    """Private constructor for `EditedEntry`.

    One or more `EditedEntry` can be initialized from a `text` and one or more `edits`, e.g. `Hello world!` and
    `["Goodbye world!", "Hello planet!"]`, using `EditedEntry.from_edits(text=..., edits=...)`.

    Args:
        orig (LabeledEntry): The entry for the original text.
        edit (LabeledEntry): The entry for the edited text.
        has_gaps (bool): Whether the token sequences contain gap tokens.
        has_bos_token (bool): Whether the tokenizer has a beginning-of-sequence token.
        has_eos_token (bool): Whether the tokenizer has an end-of-sequence token.
        aligned (WordOutput | None): The alignment between the tokens of `orig` and `edit`, if available.
        info (dict[str, str | int | float | bool]): Additional information about the entry. A copy is stored,
            so later changes to the passed dictionary do not affect the entry.
        constructor_key (object | None): The private key that authorizes direct instantiation.

    Raises:
        RuntimeError: If `constructor_key` is not the private constructor key of the class.
    """
    if constructor_key != self.__constructor_key:
        raise RuntimeError(
            dedent("""\
            The default constructor for `EditedEntry` is private. One or more `EditedEntry` can be initialized from
            a  `text` and one or more `edits`, e.g. `Hello world!` and `["Goodbye world!", "Hello planet!"]`, using
             `EditedEntry.from_edits(text=..., edits=...)`.
            """)
        )
    self._orig = orig
    self._edit = edit
    self._aligned = aligned
    self._has_gaps = has_gaps
    self._has_bos_token = has_bos_token
    self._has_eos_token = has_eos_token
    # The entry exposes the union of the label types found in the original and edited entries.
    self._label_types = list(set(self._orig._label_types) | set(self._edit._label_types))
    # Store a private copy: the `{}` default is a single object shared by every call, so keeping a reference
    # to it would let a change made through one entry leak into all entries built with the default.
    self._info = dict(info)

orig property writable

The LabeledEntry for the original text.

edit property writable

The LabeledEntry for the edited text.

aligned property writable

aligned: WordOutput | None

Aligned output using jiwer for orig and edit.

has_gaps property writable

has_gaps: bool

Boolean flag marking whether the token sequence has added gaps for insertion/deletion annotations.

has_bos_token property writable

has_bos_token: bool

Boolean flag marking whether the tokenizer has a beginning-of-sequence token.

has_eos_token property writable

has_eos_token: bool

Boolean flag marking whether the tokenizer has an end-of-sequence token.

aligned_str property writable

aligned_str: str

Aligned string at the token level with jiwer.visualize_alignment.

from_edits classmethod

from_edits(
    text: str,
    edits: str | list[str],
    tokenizer: str
    | Tokenizer
    | PreTrainedTokenizer
    | PreTrainedTokenizerFast
    | None = None,
    tokenizer_kwargs: dict = {},
    with_gaps: bool = True,
    keep_final_gap: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
    info: InfoDictType | list[InfoDictType] = {},
) -> EditedEntry | MultiEditEntry

Create an EditedEntry or a MultiEditEntry from a text and one or more edits.

Parameters:

Name Type Description Default

text

str

The original text.

required

edits

str | list[str] | None

One or more edited versions of the text.

required

tokenizer

str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None

A Tokenizer used for tokenization. Supports initialization from a transformers.PreTrainedTokenizer, and uses whitespace tokenization by default.

None

tokenizer_kwargs

dict

Additional arguments for the tokenizer.

{}

with_gaps

bool

Whether to add gaps to the tokens and offsets. Gaps are used to mark the positions of insertions and deletions in the original/edited texts, respectively. If false, those are merged to the next token to the right. Default: True.

True

keep_final_gap

bool

Whether to keep the final gap token. Default: True.

True

sub_label

str

The label for substitutions. Default: "S".

'S'

ins_label

str

The label for insertions. Default: "I".

'I'

del_label

str

The label for deletions. Default: "D".

'D'

gap_token

str

The token to use for gaps. Default: "▁".

'▁'

info

dict[str, str | int | float | bool] | list[dict[str, str | int | float | bool]]

A dictionary containing additional information about the entry.

{}

Returns:

Type Description
EditedEntry | MultiEditEntry

A single EditedEntry if only one edit is provided, otherwise a MultiEditEntry with one entry per edit.

Example
from labl.data.edited_entry import EditedEntry

entries = EditedEntry.from_edits(
    text="a simple example",
    edits=["this is a simple enough test, you know?", "an example"],
    tokenizer="facebook/nllb-200-3.3B",
    tokenizer_kwargs={
        "tgt_lang": "ita_Latn",
        "add_special_tokens": True,
    },
)
print(entries[0].aligned_str)
>>> ORIG: ita_Latn ***** *** a simple ******* ***** * **** ***** example </s>
    EDIT: ita_Latn this is a simple enough test , you know        ? </s>
                       I   I                  I     I I    I     I        S
Source code in labl/data/edited_entry.py
@classmethod
def from_edits(
    cls,
    text: str,
    edits: str | list[str],
    tokenizer: str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None,
    tokenizer_kwargs: dict = {},
    with_gaps: bool = True,
    keep_final_gap: bool = True,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
    info: InfoDictType | list[InfoDictType] = {},
) -> "EditedEntry | MultiEditEntry":
    """Create an `EditedEntry` or a `MultiEditEntry` from a text and one or more edits.

    Args:
        text (str): The original text.
        edits (str | list[str]): One or more edited versions of the text.
        tokenizer (str | Tokenizer | PreTrainedTokenizer | PreTrainedTokenizerFast | None): A `Tokenizer`
            used for tokenization. Supports initialization from a `transformers.PreTrainedTokenizer`, and uses
            whitespace tokenization by default.
        tokenizer_kwargs (dict): Additional arguments for the tokenizer.
        with_gaps (bool): Whether to add gaps to the tokens and offsets. Gaps are used to mark the positions of
            insertions and deletions in the original/edited texts, respectively. If false, those are merged to the
            next token to the right. Default: True.
        keep_final_gap (bool): Whether to keep the final gap token. Default: True.
        sub_label (str): The label for substitutions. Default: "S".
        ins_label (str): The label for insertions. Default: "I".
        del_label (str): The label for deletions. Default: "D".
        gap_token (str): The token to use for gaps. Default: "▁".
        info (dict[str, str | int | float | bool] | list[dict[str, str | int | float | bool]]):
            A dictionary containing additional information about the entry. A single dictionary is attached to
            every entry; a list must contain exactly one dictionary per edit.

    Returns:
        A single `EditedEntry` if only one edit is provided, otherwise a `MultiEditEntry` with one entry per
            edit.

    Raises:
        RuntimeError: If `info` is a list whose length differs from the number of edits, or if `with_gaps` is
            False and the tokenizer has an EOS token while `keep_final_gap` is False.

    Example:
        ```python
        from labl.data.edited_entry import EditedEntry

        entries = EditedEntry.from_edits(
            text="a simple example",
            edits=["this is a simple enough test, you know?", "an example"],
            tokenizer="facebook/nllb-200-3.3B",
            tokenizer_kwargs={
                "tgt_lang": "ita_Latn",
                "add_special_tokens": True,
            },
        )
        print(entries[0].aligned_str)
        >>> ORIG: ita_Latn ***** *** ▁a ▁simple ******* ***** * **** ***** ▁example </s>
            EDIT: ita_Latn ▁this ▁is ▁a ▁simple ▁enough ▁test , ▁you ▁know        ? </s>
                               I   I                  I     I I    I     I        S
        ```
    """
    edits = [edits] if isinstance(edits, str) else edits
    if isinstance(info, list) and len(edits) != len(info):
        raise RuntimeError(
            f"The number of edits ({len(edits)}) does not match the number of info dictionaries ({len(info)})."
        )
    tokenizer = get_tokenizer(tokenizer, tokenizer_kwargs)
    # The original text is tokenized once and reused for every edit.
    tokens, offsets = tokenizer.tokenize_with_offsets(text)
    tokens_with_gaps, offsets_with_gaps = tokenizer._add_gaps_to_tokens_and_offsets(tokens, offsets, gap_token)
    tokens, offsets = tokens[0], offsets[0]
    tokens_with_gaps, offsets_with_gaps = tokens_with_gaps[0], offsets_with_gaps[0]
    all_edits_tokens, all_edits_offsets = tokenizer.tokenize_with_offsets(edits)
    all_edits_tokens_with_gaps, all_edits_offsets_with_gaps = tokenizer._add_gaps_to_tokens_and_offsets(
        all_edits_tokens, all_edits_offsets, gap_token=gap_token
    )
    entries = MultiEditEntry(info=info if isinstance(info, dict) else {})
    all_info_dicts = info if isinstance(info, list) else [info] * len(edits)
    for edit, e_tokens, e_offsets, e_tokens_with_gaps, e_offsets_with_gaps, e_info in zip(
        edits,
        all_edits_tokens,
        all_edits_offsets,
        all_edits_tokens_with_gaps,
        all_edits_offsets_with_gaps,
        all_info_dicts,
        strict=True,
    ):
        aligned = process_words(
            texts=[tokens], edits=[e_tokens], is_text_pre_transformed=True, is_edit_pre_transformed=True
        )
        # Labels are always computed on the gapped sequences and merged afterwards if gaps are not requested.
        tokens_labels, e_tokens_labels = cls.get_tokens_labels_from_edit(
            text=text,
            edit=edit,
            tokens=tokens_with_gaps,
            tokens_offsets=offsets_with_gaps,
            edit_tokens=e_tokens_with_gaps,
            edit_tokens_offsets=e_offsets_with_gaps,
            aligned=aligned,
            tokenizer=tokenizer,
            sub_label=sub_label,
            ins_label=ins_label,
            del_label=del_label,
            gap_token=gap_token,
        )
        if with_gaps:
            out_tokens = tokens_with_gaps
            out_offsets = offsets_with_gaps
            out_edit_tokens = e_tokens_with_gaps
            out_edit_offsets = e_offsets_with_gaps
        else:
            # If an ad-hoc EOS is added, it is always kept
            if tokenizer.has_eos_token and not keep_final_gap:
                raise RuntimeError(
                    "The tokenizer has an EOS token, but `keep_final_gap` is set to False. "
                    "The EOS token will be kept."
                )
            tokens_labels = tokenizer._merge_gap_annotations(
                [tokens_labels], has_bos_token=tokenizer.has_bos_token, keep_final_gap=keep_final_gap
            )[0]
            e_tokens_labels = tokenizer._merge_gap_annotations(
                [e_tokens_labels], has_bos_token=tokenizer.has_bos_token, keep_final_gap=keep_final_gap
            )[0]

            # If gaps are merged, the last gap is kept regardless of it being a gap or not to mark end-insertions.
            # If the tokenizer did not have an EOS token for that, the sequence will have an extra token and offsets
            # will need to be adjusted.
            if not keep_final_gap:
                out_tokens, out_offsets = tokens, offsets
                out_edit_tokens, out_edit_offsets = e_tokens, e_offsets
            else:
                out_tokens, out_offsets = tokens + [gap_token], offsets + [None]
                out_edit_tokens, out_edit_offsets = e_tokens + [gap_token], e_offsets + [None]
        entry = EditedEntry(
            orig=LabeledEntry.from_tokens(
                tokens=out_tokens,
                labels=tokens_labels,
                text=text,
                offsets=out_offsets,
                tokenizer=tokenizer,
            ),
            edit=LabeledEntry.from_tokens(
                tokens=out_edit_tokens,
                labels=e_tokens_labels,
                text=edit,
                offsets=out_edit_offsets,
                tokenizer=tokenizer,
            ),
            aligned=aligned,
            has_gaps=with_gaps,
            has_bos_token=tokenizer.has_bos_token,
            has_eos_token=tokenizer.has_eos_token,
            info=e_info,
            constructor_key=cls.__constructor_key,
        )
        entries.append(entry)
    # A single edit is returned as a bare entry rather than a one-element collection.
    if len(entries) == 1:
        return entries[0]
    entries._label_types = entries._get_label_types()
    return entries

get_tokens_labels_from_edit classmethod

get_tokens_labels_from_edit(
    text: str,
    edit: str,
    tokens: list[str] | None = None,
    tokens_offsets: list[OffsetType] | None = None,
    edit_tokens: list[str] | None = None,
    edit_tokens_offsets: list[OffsetType] | None = None,
    aligned: WordOutput | None = None,
    tokenizer: Tokenizer | None = None,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> tuple[Sequence[str | None], Sequence[str | None]]

Convert text edits to token labels marking insertions, deletions and substitutions. The returned labels include gaps before/after each token, which can be merged to the right to match the original token sequence.

Parameters:

Name Type Description Default

text

str

The original text.

required

edit

str

The edited text.

required

tokens

list[str] | None

The tokenized version of text. If not provided, it will be computed using tokenizer. Default: None.

None

tokens_offsets

list[tuple[int, int] | None]

The offsets of tokens in text. If not provided, it will be computed using tokenizer. Default: None.

None

edit_tokens

list[str] | None

The tokenized version of edit. If not provided, it will be computed using tokenizer. Default: None.

None

edit_tokens_offsets

list[tuple[int, int] | None]

The offsets of edit_tokens in edit. If not provided, it will be computed using tokenizer. Default: None.

None

aligned

WordOutput | None

The aligned WordOutput between text and edit. If not provided, it will be obtained automatically using tokenizer for splitting. Default: None.

None

tokenizer

Tokenizer | None

A Tokenizer used for text splitting. If not provided, whitespace tokenization is used by default. Default: None.

None

sub_label

str

The label for substitutions. Default: "S".

'S'

ins_label

str

The label for insertions. Default: "I".

'I'

del_label

str

The label for deletions. Default: "D".

'D'

gap_token

str

The token to use for gaps. Default: "▁".

'▁'

Returns:

Type Description
tuple[Sequence[str | None], Sequence[str | None]]

A tuple containing two lists of labels (one for text, one for edit). Each label can be either None (if the token was not edited) or one of sub_label, ins_label or del_label depending on the type of operation associated with the token.

Source code in labl/data/edited_entry.py
@classmethod
def get_tokens_labels_from_edit(
    cls,
    text: str,
    edit: str,
    tokens: list[str] | None = None,
    tokens_offsets: list[OffsetType] | None = None,
    edit_tokens: list[str] | None = None,
    edit_tokens_offsets: list[OffsetType] | None = None,
    aligned: WordOutput | None = None,
    tokenizer: Tokenizer | None = None,
    sub_label: str = "S",
    ins_label: str = "I",
    del_label: str = "D",
    gap_token: str = "▁",
) -> tuple[Sequence[str | None], Sequence[str | None]]:
    """Turn the alignment between `text` and `edit` into per-token edit labels for both sides.

    Labels are written on the gapped token sequences: the i-th aligned token is labeled at position
    `2 * i + 1`, and the gap in front of it at position `2 * i`. The gap labels can later be merged to the
    right to match the original token sequence.

    Args:
        text (str): The original text.
        edit (str): The edited text.
        tokens (list[str] | None): The tokenized version of `text`. If not provided, it will be computed using
            `tokenizer`. Default: `None`.
        tokens_offsets (list[tuple[int, int] | None]): The offsets of `tokens` in `text`. If not provided, it will
            be computed using `tokenizer`. Default: `None`.
        edit_tokens (list[str] | None): The tokenized version of `edit`. If not provided, it will be computed using
            `tokenizer`. Default: `None`.
        edit_tokens_offsets (list[tuple[int, int] | None]): The offsets of `edit_tokens` in `edit`. If not
            provided, it will be computed using `tokenizer`. Default: `None`.
        aligned (WordOutput | None): The aligned `WordOutput` between `text` and `edit`. If not provided, it will
            be obtained automatically using `tokenizer` for splitting. Default: `None`.
        tokenizer (Tokenizer | None): A `Tokenizer` used for text splitting. If not provided, whitespace
            tokenization is used by default. Default: `None`.
        sub_label (str): The label for substitutions. Default: "S".
        ins_label (str): The label for insertions. Default: "I".
        del_label (str): The label for deletions. Default: "D".
        gap_token (str): The token to use for gaps. Default: "▁".

    Returns:
        A tuple containing two lists of labels (one for `text`, one for `edit`). Each label can be either None
            (if the token was not edited) or one of `sub_label`, `ins_label` or `del_label` depending on the type
            of operation associated with the token.
    """
    if tokenizer is None:
        logger.info("Tokenizer was not provided. Defaulting to whitespace tokenization.")
        tokenizer = WhitespaceTokenizer()
    if aligned is None:
        aligned = process_words(
            text, edit, texts_transform=tokenizer.transform, edits_transform=tokenizer.transform
        )
    tokens, tokens_offsets = cls._get_tokens_with_gaps(tokenizer, text, tokens, tokens_offsets, gap_token)
    edit_tokens, edit_tokens_offsets = cls._get_tokens_with_gaps(
        tokenizer, edit, edit_tokens, edit_tokens_offsets, gap_token
    )
    tokens_labels: list[str | None] = [None] * len(tokens)
    edit_tokens_labels: list[str | None] = [None] * len(edit_tokens)
    for chunk in aligned.alignments[0]:
        # Alignment indices are shifted back by one position when the tokenizer has a BOS token.
        bos_shift = 1 if tokenizer.has_bos_token else 0
        src_lo = chunk.ref_start_idx - bos_shift
        src_hi = chunk.ref_end_idx - bos_shift
        tgt_lo = chunk.hyp_start_idx - bos_shift
        tgt_hi = chunk.hyp_end_idx - bos_shift
        operation = chunk.type
        if operation == "insert":
            # The original side marks the gap where the text was inserted; the edit side marks the new tokens.
            tokens_labels[src_lo * 2] = ins_label
            for tgt_idx in range(tgt_lo, tgt_hi):
                edit_tokens_labels[tgt_idx * 2 + 1] = ins_label
        elif operation == "delete":
            # The original side marks the removed tokens; the edit side marks the gap they left behind.
            for src_idx in range(src_lo, src_hi):
                tokens_labels[src_idx * 2 + 1] = del_label
            edit_tokens_labels[tgt_lo * 2] = del_label
        elif operation == "substitute":
            # Both sides mark the replaced tokens.
            for src_idx in range(src_lo, src_hi):
                tokens_labels[src_idx * 2 + 1] = sub_label
            for tgt_idx in range(tgt_lo, tgt_hi):
                edit_tokens_labels[tgt_idx * 2 + 1] = sub_label
    return tokens_labels, edit_tokens_labels

get_tokens

get_tokens() -> list[str]
Source code in labl/data/edited_entry.py
def get_tokens(self) -> list[str]:
    """Return the tokens of the original entry (`orig`)."""
    original_entry = self.orig
    return original_entry.tokens

get_labels

get_labels() -> Sequence[LabelType]
Source code in labl/data/edited_entry.py
def get_labels(self) -> Sequence[LabelType]:
    """Return the per-token labels of the original entry (`orig`)."""
    original_entry = self.orig
    return original_entry.tokens_labels

to_dict

to_dict() -> EditedEntryDictType

Convert the EditedEntry to a dictionary representation.

Returns:

Type Description
EditedEntryDictType

A dictionary representation of the EditedEntry.

Source code in labl/data/edited_entry.py
def to_dict(self) -> EditedEntryDictType:
    """Serialize this `EditedEntry` into a dictionary.

    The `orig` and `edit` entries are serialized through their own `to_dict()` methods. The `_class` key
    stores the class name. The alignment (`aligned`) is not part of the output.

    Returns:
        A dictionary representation of the `EditedEntry`.
    """
    payload = {
        "_class": self.__class__.__name__,
        "info": self.info,
        "orig": self.orig.to_dict(),
        "edit": self.edit.to_dict(),
        "has_bos_token": self.has_bos_token,
        "has_eos_token": self.has_eos_token,
        "has_gaps": self.has_gaps,
    }
    return EditedEntryDictType(payload)

from_dict classmethod

from_dict(data: EditedEntryDictType) -> EditedEntry

Create an EditedEntry from a dictionary representation.

Parameters:

Name Type Description Default

data

dict

A dictionary representation of the EditedEntry obtained with to_dict().

required

Returns:

Type Description
EditedEntry

An EditedEntry object.

Source code in labl/data/edited_entry.py
@classmethod
def from_dict(cls, data: EditedEntryDictType) -> "EditedEntry":
    """Rebuild an `EditedEntry` from the dictionary produced by `to_dict()`.

    No alignment is passed to the constructor, so the restored entry uses the constructor default for
    `aligned`.

    Args:
        data (dict): A dictionary representation of the `EditedEntry` obtained with `to_dict()`.

    Returns:
        An `EditedEntry` object.

    Raises:
        RuntimeError: If `data` has no `_class` key, or if `_class` does not match the name of this class.
    """
    if "_class" not in data:
        raise RuntimeError("The provided dictionary is missing the required _class attribute.")
    if data["_class"] != cls.__name__:
        raise RuntimeError(f"Cannot load a {cls.__name__} object from {data['_class']}")
    # Fields are read in the same order in which they are handed to the private constructor.
    init_kwargs = {
        "orig": LabeledEntry.from_dict(data["orig"]),
        "edit": LabeledEntry.from_dict(data["edit"]),
        "has_bos_token": data["has_bos_token"],
        "has_eos_token": data["has_eos_token"],
        "has_gaps": data["has_gaps"],
        "info": data["info"],
    }
    return cls(**init_kwargs, constructor_key=cls.__constructor_key)

merge_gap_annotations

merge_gap_annotations(
    merge_fn: Callable[[Sequence[LabelType]], LabelType]
    | None = None,
    keep_final_gap: bool = True,
) -> None

Merge gap annotations in the tokens of orig and edit.

This method is equivalent to calling EditedEntry.from_edits with with_gaps=False. Gap annotations are merged to the next non-gap token to the right, and the gap label is added to the label of the non-gap token. The last gap is kept to account for insertions at the end of the text.

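E.g. GAP Hello GAP World GAP ! GAP with labels I (first gap), S (Hello), I (second gap) and I (final gap) becomes Hello World ! GAP with labels IS (Hello), I (World) and I (final gap).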

Source code in labl/data/edited_entry.py
def merge_gap_annotations(
    self,
    merge_fn: Callable[[Sequence[LabelType]], LabelType] | None = None,
    keep_final_gap: bool = True,
) -> None:
    """Merge gap annotations in the tokens of `orig` and `edit`, in place.

    This method is equivalent to calling `EditedEntry.from_edits` with `with_gaps=False`. Gap annotations are merged
    to the next non-gap token to the right, and the gap label is added to the label of the non-gap token. The last
    gap is kept to account for insertions at the end of the text.

    E.g. `GAP Hello GAP World GAP ! GAP` becomes `Hello World ! GAP`.
         `  I     S   I               I`         `   IS     I     I`

    Args:
        merge_fn (Callable[[Sequence[LabelType]], LabelType] | None): Forwarded to
            `Tokenizer._merge_gap_annotations` for both entries; presumably decides how a gap label and a token
            label are combined (confirm against `Tokenizer`). Default: `None`.
        keep_final_gap (bool): Forwarded to the `Tokenizer` helpers that remove gap tokens, labels and offsets.
            Default: True.

    Raises:
        RuntimeError: If the gaps of this entry were already merged, or if the entry has an EOS token and
            `keep_final_gap` is False.
    """
    if not self._has_gaps:
        raise RuntimeError("Gaps for the current entry were already merged.")
    has_bos = self._has_bos_token
    if self.has_eos_token:
        if not keep_final_gap:
            raise RuntimeError(
                "The tokenizer has an EOS token, but `keep_final_gap` is set to False. The EOS token will be kept."
            )
    # Snapshot the gapped tokens, labels and offsets of both entries before overwriting them.
    o_tok, o_lab, o_off = self._orig._tokens, self._orig._tokens_labels, self._orig._tokens_offsets
    e_tok, e_lab, e_off = self._edit._tokens, self._edit._tokens_labels, self._edit._tokens_offsets
    # The Tokenizer helpers work on batches, so each sequence is wrapped in a list and unwrapped with [0].
    self._orig._tokens = Tokenizer._remove_gap_tokens([o_tok], self._has_bos_token, keep_final_gap)[0]
    self._edit._tokens = Tokenizer._remove_gap_tokens([e_tok], self._has_bos_token, keep_final_gap)[0]
    self._orig._tokens_labels = Tokenizer._merge_gap_annotations([o_lab], merge_fn, has_bos, keep_final_gap)[0]
    self._edit._tokens_labels = Tokenizer._merge_gap_annotations([e_lab], merge_fn, has_bos, keep_final_gap)[0]
    self._orig._tokens_offsets = Tokenizer._remove_gap_offsets([o_off], self._has_bos_token, keep_final_gap)[0]
    self._edit._tokens_offsets = Tokenizer._remove_gap_offsets([e_off], self._has_bos_token, keep_final_gap)[0]
    # Record that gaps are merged; the EOS flag is set to mirror whether the final gap was kept.
    self._has_gaps = False
    self._has_eos_token = keep_final_gap