import numpy as np
import torch
from transformers import EvalPrediction
def get_data_collator(tokenizer, max_length=-1, ignore_masked_token=True, model_type="causal"):
"""Get a data collator for a model,
Shifting the inputs and labels to align them happens inside the model,
so the data collator just copies the inputs to create the labels.
Args:
tokenizer (Tokenizer): the tokenizer to use
max_length (int, optional): the maximum length of the input text. Defaults to -1.
ignore_masked_token (bool, optional): whether to ignore the masked tokens. Defaults to True.
model_type (str, optional): the type of the model. Defaults to "causal".
Returns:
callable: the data collator
"""
    def huggingface_causal_collator(batch):
        ignore_index = -100
        collated_batch = list()
        for sample in batch:
            collated_batch.append(sample["x"] + " " + sample["y"] + tokenizer.eos_token)
        encodings = tokenizer(
            collated_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length if max_length != -1 else None,
        )
        # Measure the length of each input text in tokens so the prompt
        # positions can be masked out of the labels below.
        input_text_length_list = list()
        for sample in batch:
            input_text_length_list.append(len(tokenizer(sample["x"], return_tensors="pt")["input_ids"][0]))
        labels = encodings["input_ids"].clone()
        # Mask the prompt tokens so the loss is only computed on the target text.
        for i, l in enumerate(input_text_length_list):
            labels[i, :l] = ignore_index
        if ignore_masked_token:
            # The attention mask here is a padding mask: in Hugging Face
            # Transformers, 1 marks valid tokens and 0 marks padding.
            # This is the opposite of PyTorch's nn.Transformer masks, where
            # True marks positions to be ignored.
            labels[encodings["attention_mask"] == 0] = ignore_index
        results = {
            "input_ids": encodings["input_ids"],
            "attention_mask": encodings["attention_mask"],
            "labels": labels,
        }
        return results
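
    # A minimal illustration (comments only, not executed): for a toy sample
    # such as {"x": "Translate: hi", "y": "bonjour"}, the causal collator
    # tokenizes "Translate: hi bonjour<eos>", copies the input ids into
    # `labels`, and sets the label positions covering "Translate: hi" (and,
    # optionally, padding) to -100 so the loss only covers the target text;
    # the one-token shift needed for next-token prediction is applied inside
    # the model, not here.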
    def huggingface_autoregressive_collator(batch):
        ignore_index = -100
        collated_batch = list()
        for sample in batch:
            collated_batch.append(sample["x"] + " " + sample["y"] + tokenizer.eos_token)
        encodings = tokenizer(
            collated_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length if max_length != -1 else None,
        )
        # Unlike the causal collator, the prompt tokens stay in the labels,
        # so the loss covers the full sequence.
        labels = encodings["input_ids"].clone()
        if ignore_masked_token:
            # The attention mask here is a padding mask: in Hugging Face
            # Transformers, 1 marks valid tokens and 0 marks padding.
            # This is the opposite of PyTorch's nn.Transformer masks, where
            # True marks positions to be ignored.
            labels[encodings["attention_mask"] == 0] = ignore_index
        results = {
            "input_ids": encodings["input_ids"],
            "attention_mask": encodings["attention_mask"],
            "labels": labels,
        }
        return results
    def huggingface_seq2seq_collator(batch):
        ignore_index = -100
        # batch is a list of {"x": ..., "y": ...} samples, as in the other collators.
        inputs = tokenizer(
            [sample["x"] for sample in batch],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length if max_length != -1 else None,
        )
        outputs = tokenizer(
            [sample["y"] for sample in batch],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length if max_length != -1 else None,
        )
        results = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": outputs["input_ids"],
            "decoder_attention_mask": outputs["attention_mask"],
        }
        if ignore_masked_token:
            # The attention mask here is a padding mask: in Hugging Face
            # Transformers, 1 marks valid tokens and 0 marks padding.
            # This is the opposite of PyTorch's nn.Transformer masks, where
            # True marks positions to be ignored.
            results["labels"][outputs["attention_mask"] == 0] = ignore_index
        return results
    match model_type:
        case "causal":
            return huggingface_causal_collator
        case "seq2seq":
            return huggingface_seq2seq_collator
        case "autoregressive":
            return huggingface_autoregressive_collator
        case _:
            raise ValueError(f"Unsupported model type: {model_type}")
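
# Example usage (a minimal, untested sketch; the "gpt2" checkpoint, the dataset
# fields "x"/"y", train_dataset, and the TrainingArguments values below are
# placeholders, not part of this module):
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
#
#   tokenizer = AutoTokenizer.from_pretrained("gpt2")
#   model = AutoModelForCausalLM.from_pretrained("gpt2")
#   model, tokenizer = add_maybe_special_tokens(model, tokenizer)
#   collator = get_data_collator(tokenizer, max_length=512, model_type="causal")
#
#   trainer = Trainer(
#       model=model,
#       args=TrainingArguments(output_dir="out", per_device_train_batch_size=8),
#       train_dataset=train_dataset,  # yields dicts with "x" and "y" string fields
#       data_collator=collator,
#   )
#   trainer.train()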
def compute_metrics(eval_prediction: EvalPrediction):
"""Compute the metrics for a model
Args:
eval_prediction (EvalPrediction): the evaluation prediction
Returns:
dict: the metrics
"""
predictions, _labels = eval_prediction.predictions
predictions = predictions[:, :-1]
labels = eval_prediction.label_ids[:, 1:]
predictions = predictions.flatten().astype(np.int32)
labels = labels.flatten().astype(np.int32)
correct_predictions = np.sum((predictions == labels))
total_samples = len(labels)
accuracy = correct_predictions / total_samples
return {
"accuracy": accuracy,
}
def preprocess_logits_for_metrics(logits, labels):
"""Preprocess the logits for metrics
Args:
logits (torch.Tensor): the logits
labels (torch.Tensor): the ground truth labels
Returns:
tuple: the prediction ids and the ground truth labels
"""
# import traceback
# traceback.print_stack()
if isinstance(logits, tuple):
logits = logits[0]
prediction_ids = torch.argmax(logits, dim=-1)
return prediction_ids, labels
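
# A minimal sketch of how the two helpers above are typically wired into a
# Hugging Face Trainer for evaluation (model, training_args, tokenizer, and
# eval_dataset are placeholders, not part of this module):
#
#   trainer = Trainer(
#       model=model,
#       args=training_args,
#       eval_dataset=eval_dataset,
#       data_collator=get_data_collator(tokenizer, model_type="causal"),
#       compute_metrics=compute_metrics,
#       preprocess_logits_for_metrics=preprocess_logits_for_metrics,
#   )
#   metrics = trainer.evaluate()  # returns e.g. {"eval_accuracy": ..., ...}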
def add_maybe_special_tokens(model, tokenizer):
"""Add maybe special tokens to the tokenizer and model, resize the model's token embeddings if needed
Args:
model (torch.nn.Module): the model to add the special tokens
tokenizer (Tokenizer): the tokenizer to add the special tokens
Returns:
tuple: the model and the tokenizer
"""
if tokenizer.eos_token is None:
tokenizer.add_special_tokens({"eos_token": "<|endoftext|>"})
model.resize_token_embeddings(len(tokenizer))
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
return model, tokenizer
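
# Example (a minimal sketch; "gpt2" is just an illustrative checkpoint):
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("gpt2")
#   model = AutoModelForCausalLM.from_pretrained("gpt2")
#   # GPT-2 already has an EOS token but no pad token, so this call simply
#   # sets pad_token = eos_token and leaves the embeddings untouched.
#   model, tokenizer = add_maybe_special_tokens(model, tokenizer)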